diff mbox series

[v7,1/5] block/io: fix bdrv_co_block_status_above

Message ID 20200924194003.22080-2-vsementsov@virtuozzo.com
State Superseded
Headers show
Series fix & merge block_status_above and is_allocated_above | expand

Commit Message

Vladimir Sementsov-Ogievskiy Sept. 24, 2020, 7:39 p.m. UTC
bdrv_co_block_status_above has several design problems with handling
short backing files:

1. With want_zeros=true, it may return ret with BDRV_BLOCK_ZERO but
without BDRV_BLOCK_ALLOCATED flag, when actually short backing file
which produces these after-EOF zeros is inside requested backing
sequence.

2. With want_zero=false, it may return pnum=0 prior to actual EOF,
because of EOF of short backing file.

Fix these things, making logic about short backing files clearer.

With fixed bdrv_block_status_above we also have to improve is_zero in
qcow2 code, otherwise iotest 154 will fail, because with this patch we
stop to merge zeros of different types (produced by fully unallocated
in the whole backing chain regions vs produced by short backing files).

Note also, that this patch leaves for another day the general problem
around block-status: misuse of BDRV_BLOCK_ALLOCATED as is-fs-allocated
vs go-to-backing.

Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Reviewed-by: Alberto Garcia <berto@igalia.com>
---
 block/io.c    | 68 ++++++++++++++++++++++++++++++++++++++++-----------
 block/qcow2.c | 16 ++++++++++--
 2 files changed, 68 insertions(+), 16 deletions(-)

Comments

Eric Blake Sept. 24, 2020, 8:01 p.m. UTC | #1
On 9/24/20 2:39 PM, Vladimir Sementsov-Ogievskiy wrote:
> bdrv_co_block_status_above has several design problems with handling

> short backing files:

> 

> 1. With want_zeros=true, it may return ret with BDRV_BLOCK_ZERO but

> without BDRV_BLOCK_ALLOCATED flag, when actually short backing file

> which produces these after-EOF zeros is inside requested backing

> sequence.

> 

> 2. With want_zero=false, it may return pnum=0 prior to actual EOF,

> because of EOF of short backing file.

> 

> Fix these things, making logic about short backing files clearer.

> 

> With fixed bdrv_block_status_above we also have to improve is_zero in

> qcow2 code, otherwise iotest 154 will fail, because with this patch we

> stop to merge zeros of different types (produced by fully unallocated

> in the whole backing chain regions vs produced by short backing files).

> 

> Note also, that this patch leaves for another day the general problem

> around block-status: misuse of BDRV_BLOCK_ALLOCATED as is-fs-allocated

> vs go-to-backing.

> 

> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>

> Reviewed-by: Alberto Garcia <berto@igalia.com>

> ---

>   block/io.c    | 68 ++++++++++++++++++++++++++++++++++++++++-----------

>   block/qcow2.c | 16 ++++++++++--

>   2 files changed, 68 insertions(+), 16 deletions(-)

> 

> diff --git a/block/io.c b/block/io.c

> index 449b99b92c..4697e67a85 100644

> --- a/block/io.c

> +++ b/block/io.c

> @@ -2350,34 +2350,74 @@ bdrv_co_common_block_status_above(BlockDriverState *bs,

>                                     int64_t *map,

>                                     BlockDriverState **file)

>   {

> +    int ret;

>       BlockDriverState *p;

> -    int ret = 0;


The shuffle of these lines is odd, but not a show-stopper.

The new flow is definitely easier to read.  Thanks for doing this!

> +++ b/block/qcow2.c

> @@ -3860,8 +3860,20 @@ static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)

>       if (!bytes) {

>           return true;

>       }

> -    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);

> -    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;

> +

> +    /*

> +     * bdrv_block_status_above doesn't merge different types of zeros, for


We're inconsistent on whether we spell the noun 'zeros' or 'zeroes'; but 
I don't care enough to make you change this.

> +     * example, zeros which come from the region which is unallocated in

> +     * the whole backing chain, and zeros which comes because of a short


s/comes/come/

> +     * backing file. So, we need a loop.

> +     */

> +    do {

> +        res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);

> +        offset += nr;

> +        bytes -= nr;

> +    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);

> +

> +    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;

>   }

>   

>   static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,

> 


Reviewed-by: Eric Blake <eblake@redhat.com>


-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org
diff mbox series

Patch

diff --git a/block/io.c b/block/io.c
index 449b99b92c..4697e67a85 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2350,34 +2350,74 @@  bdrv_co_common_block_status_above(BlockDriverState *bs,
                                   int64_t *map,
                                   BlockDriverState **file)
 {
+    int ret;
     BlockDriverState *p;
-    int ret = 0;
-    bool first = true;
+    int64_t eof = 0;
 
     assert(bs != base);
-    for (p = bs; p != base; p = bdrv_filter_or_cow_bs(p)) {
+
+    ret = bdrv_co_block_status(bs, want_zero, offset, bytes, pnum, map, file);
+    if (ret < 0 || *pnum == 0 || ret & BDRV_BLOCK_ALLOCATED) {
+        return ret;
+    }
+
+    if (ret & BDRV_BLOCK_EOF) {
+        eof = offset + *pnum;
+    }
+
+    assert(*pnum <= bytes);
+    bytes = *pnum;
+
+    for (p = bdrv_filter_or_cow_bs(bs); p != base;
+         p = bdrv_filter_or_cow_bs(p))
+    {
         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
                                    file);
         if (ret < 0) {
-            break;
+            return ret;
         }
-        if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
+        if (*pnum == 0) {
             /*
-             * Reading beyond the end of the file continues to read
-             * zeroes, but we can only widen the result to the
-             * unallocated length we learned from an earlier
-             * iteration.
+             * The top layer deferred to this layer, and because this layer is
+             * short, any zeroes that we synthesize beyond EOF behave as if they
+             * were allocated at this layer.
+             *
+             * We don't include BDRV_BLOCK_EOF into ret, as upper layer may be
+             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
+             * below.
              */
+            assert(ret & BDRV_BLOCK_EOF);
             *pnum = bytes;
+            if (file) {
+                *file = p;
+            }
+            ret = BDRV_BLOCK_ZERO | BDRV_BLOCK_ALLOCATED;
+            break;
         }
-        if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
+        if (ret & BDRV_BLOCK_ALLOCATED) {
+            /*
+             * We've found the node and the status, we must break.
+             *
+             * Drop BDRV_BLOCK_EOF, as it's not for upper layer, which may be
+             * larger. We'll add BDRV_BLOCK_EOF if needed at function end, see
+             * below.
+             */
+            ret &= ~BDRV_BLOCK_EOF;
             break;
         }
-        /* [offset, pnum] unallocated on this layer, which could be only
-         * the first part of [offset, bytes].  */
-        bytes = MIN(bytes, *pnum);
-        first = false;
+
+        /*
+         * OK, [offset, offset + *pnum) region is unallocated on this layer,
+         * let's continue the diving.
+         */
+        assert(*pnum <= bytes);
+        bytes = *pnum;
     }
+
+    if (offset + *pnum == eof) {
+        ret |= BDRV_BLOCK_EOF;
+    }
+
     return ret;
 }
 
diff --git a/block/qcow2.c b/block/qcow2.c
index b05512718c..a1bc16e202 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3860,8 +3860,20 @@  static bool is_zero(BlockDriverState *bs, int64_t offset, int64_t bytes)
     if (!bytes) {
         return true;
     }
-    res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
-    return res >= 0 && (res & BDRV_BLOCK_ZERO) && nr == bytes;
+
+    /*
+     * bdrv_block_status_above doesn't merge different types of zeros, for
+     * example, zeros which come from the region which is unallocated in
+     * the whole backing chain, and zeros which comes because of a short
+     * backing file. So, we need a loop.
+     */
+    do {
+        res = bdrv_block_status_above(bs, NULL, offset, bytes, &nr, NULL, NULL);
+        offset += nr;
+        bytes -= nr;
+    } while (res >= 0 && (res & BDRV_BLOCK_ZERO) && nr && bytes);
+
+    return res >= 0 && (res & BDRV_BLOCK_ZERO) && bytes == 0;
 }
 
 static coroutine_fn int qcow2_co_pwrite_zeroes(BlockDriverState *bs,