Eric Blake
2022-May-31 14:15 UTC
[Libguestfs] [libnbd PATCH 2/2] python: Optimize away copy during pread_structured
The Py_BuildValue "y#" format copies data; this is because Python wants to allow our C memory to go out of scope without worrying about whether the user's python callback has stashed off a longer-living reference to its incoming parameter. But it is inefficient; we can do better by utilizing Python's memoryview for a zero-copy exposure to the callback's C buffer, as well as a .release method that we can utilize just before our C memory goes out of scope. Now, if the user stashes away a reference, they will get a clean Python error if they try to access the memory after the fact. This IS an API change (code previously expecting a stashed copy to be long-lived will break), but we never promised Python API stability, and anyone writing a callback that saves off data was already risky (neither libnbd nor nbdkit's testsuite had such a case). For a demonstration of the new error, where the old code succeeded: $ ./run nbdsh nbd> h.connect_command(["nbdkit", "-s", "memory", "10"]) nbd> copy=None nbd> def f(b,o,s,e): ... global copy ... copy = b ... print(b[0]) ... nbd> print(copy) None nbd> h.pread_structured(10,0,f) 0 bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') nbd> copy <released memory at 0x7ff97410de40> nbd> copy[0] Traceback (most recent call last): File "/usr/lib64/python3.10/code.py", line 90, in runcode exec(code, self.locals) File "<console>", line 1, in <module> ValueError: operation forbidden on released memoryview object To demonstrate the speedup, I tested: $ export script=' def f(b,o,s,e): pass m=1024*1024 size=h.get_size() for i in range(size // m): buf = h.pread_structured(m, m*i, f) ' $ time ./run nbdkit -U - memory 10G --run 'nbdsh -u "$uri" -c "$script"' On my machine, this took 9.1s pre-patch, and 3.0s post-patch, for an approximate 65% speedup. The corresponding diff to the generated code is: | --- python/methods.c.bak 2022-05-31 07:57:25.256293091 -0500 | +++ python/methods.c 2022-05-31 08:14:09.570567858 -0500 | @@ -73,8 +73,12 @@ chunk_wrapper (void *user_data, const vo | | PyGILState_STATE py_save = PyGILState_UNLOCKED; | PyObject *py_args, *py_ret; | + PyObject *py_subbuf = NULL; | PyObject *py_error = NULL; | | + /* Cast away const */ | + py_subbuf = PyMemoryView_FromMemory ((char *) subbuf, count, PyBUF_READ); | + if (!py_subbuf) { PyErr_PrintEx (0); goto out; } | PyObject *py_error_modname = PyUnicode_FromString ("ctypes"); | if (!py_error_modname) { PyErr_PrintEx (0); goto out; } | PyObject *py_error_mod = PyImport_Import (py_error_modname); | @@ -84,7 +88,7 @@ | Py_DECREF (py_error_mod); | if (!py_error) { PyErr_PrintEx (0); goto out; } | | - py_args = Py_BuildValue ("(" "y#" "K" "I" "O" ")", subbuf, (int) count, offset, status, py_error); | + py_args = Py_BuildValue ("(" "O" "K" "I" "O" ")", py_subbuf, offset, status, py_error); | if (!py_args) { PyErr_PrintEx (0); goto out; } | | py_save = PyGILState_Ensure (); | @@ -111,6 +115,11 @@ chunk_wrapper (void *user_data, const vo | }; | | out: | + if (py_subbuf) { | + PyObject *tmp = PyObject_CallMethod(py_subbuf, "release", NULL); | + Py_XDECREF (tmp); | + Py_DECREF (py_subbuf); | + } | if (py_error) { | PyObject *py_error_ret = PyObject_GetAttrString (py_error, "value"); | *error = PyLong_AsLong (py_error_ret); --- generator/Python.ml | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/generator/Python.ml b/generator/Python.ml index 1c4446e..0191f79 100644 --- a/generator/Python.ml +++ b/generator/Python.ml @@ -169,8 +169,7 @@ let pr " PyObject *py_args, *py_ret;\n"; List.iter ( function - | CBArrayAndLen (UInt32 n, _) -> - pr " PyObject *py_%s = NULL;\n" n + | CBArrayAndLen (UInt32 n, _) | CBBytesIn (n, _) | CBMutable (Int n) -> pr " PyObject *py_%s = NULL;\n" n | _ -> () @@ -187,7 +186,10 @@ let pr " if (!py_e_%s) { PyErr_PrintEx (0); goto out; }\n" n; pr " PyList_SET_ITEM (py_%s, i_%s, py_e_%s);\n" n n n; pr " }\n" - | CBBytesIn _ + | CBBytesIn (n, len) -> + pr " /* Cast away const */\n"; + pr " py_%s = PyMemoryView_FromMemory ((char *) %s, %s, PyBUF_READ);\n" n n len; + pr " if (!py_%s) { PyErr_PrintEx (0); goto out; }\n" n | CBInt _ | CBInt64 _ -> () | CBMutable (Int n) -> @@ -210,7 +212,7 @@ let List.iter ( function | CBArrayAndLen (UInt32 n, len) -> pr " \"O\"" - | CBBytesIn (n, len) -> pr " \"y#\"" + | CBBytesIn (n, _) -> pr " \"O\"" | CBInt n -> pr " \"i\"" | CBInt64 n -> pr " \"L\"" | CBMutable (Int n) -> pr " \"O\"" @@ -223,7 +225,7 @@ let List.iter ( function | CBArrayAndLen (UInt32 n, _) -> pr ", py_%s" n - | CBBytesIn (n, len) -> pr ", %s, (int) %s" n len + | CBBytesIn (n, _) -> pr ", py_%s" n | CBMutable (Int n) -> pr ", py_%s" n | CBInt n | CBInt64 n | CBString n @@ -268,7 +270,13 @@ let pr " Py_DECREF (py_%s_ret);\n" n; pr " Py_DECREF (py_%s);\n" n; pr " }\n" - | CBBytesIn _ + | CBBytesIn (n, _) -> + (* https://stackoverflow.com/questions/58870010/how-to-call-release-on-a-memoryview-in-python-c-api *) + pr " if (py_%s) {\n" n; + pr " PyObject *tmp = PyObject_CallMethod(py_%s, \"release\", NULL);\n" n; + pr " Py_XDECREF (tmp);\n"; + pr " Py_DECREF (py_%s);\n" n; + pr " }\n" | CBInt _ | CBInt64 _ | CBString _ | CBUInt _ | CBUInt64 _ -> () -- 2.36.1
Nir Soffer
2022-May-31 20:12 UTC
[Libguestfs] [libnbd PATCH 2/2] python: Optimize away copy during pread_structured
On Tue, May 31, 2022 at 5:15 PM Eric Blake <eblake at redhat.com> wrote:> > The Py_BuildValue "y#" format copies data; this is because Python > wants to allow our C memory to go out of scope without worrying about > whether the user's python callback has stashed off a longer-living > reference to its incoming parameter. But it is inefficient; we can do > better by utilizing Python's memoryview for a zero-copy exposure to > the callback's C buffer, as well as a .release method that we can > utilize just before our C memory goes out of scope. Now, if the user > stashes away a reference, they will get a clean Python error if they > try to access the memory after the fact. This IS an API change (code > previously expecting a stashed copy to be long-lived will break), but > we never promised Python API stability, and anyone writing a callback > that saves off data was already risky (neither libnbd nor nbdkit's > testsuite had such a case). For a demonstration of the new error, > where the old code succeeded: > > $ ./run nbdsh > nbd> h.connect_command(["nbdkit", "-s", "memory", "10"]) > nbd> copy=None > nbd> def f(b,o,s,e): > ... global copy > ... copy = b > ... print(b[0]) > ... > nbd> print(copy) > None > nbd> h.pread_structured(10,0,f) > 0 > bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00') > nbd> copy > <released memory at 0x7ff97410de40> > nbd> copy[0] > Traceback (most recent call last): > File "/usr/lib64/python3.10/code.py", line 90, in runcode > exec(code, self.locals) > File "<console>", line 1, in <module> > ValueError: operation forbidden on released memoryview object > > To demonstrate the speedup, I tested: > > $ export script=' > def f(b,o,s,e): > pass > m=1024*1024 > size=h.get_size() > for i in range(size // m): > buf = h.pread_structured(m, m*i, f) > ' > $ time ./run nbdkit -U - memory 10G --run 'nbdsh -u "$uri" -c "$script"' > > On my machine, this took 9.1s pre-patch, and 3.0s post-patch, for an > approximate 65% speedup.Looks like ~300% speedup to me. I guess 3 times faster is the most clear way to describe the change.> The corresponding diff to the generated code is: > > | --- python/methods.c.bak 2022-05-31 07:57:25.256293091 -0500 > | +++ python/methods.c 2022-05-31 08:14:09.570567858 -0500 > | @@ -73,8 +73,12 @@ chunk_wrapper (void *user_data, const vo > | > | PyGILState_STATE py_save = PyGILState_UNLOCKED; > | PyObject *py_args, *py_ret; > | + PyObject *py_subbuf = NULL; > | PyObject *py_error = NULL; > | > | + /* Cast away const */I think it will be more clear and helpful as: /* Casting subbuf to char* is safe since we use PyBUF_READ. */> | + py_subbuf = PyMemoryView_FromMemory ((char *) subbuf, count, PyBUF_READ);Maybe py_view? And also change the doscring to mention "view" instead of "subbuf".> | + if (!py_subbuf) { PyErr_PrintEx (0); goto out; } > | PyObject *py_error_modname = PyUnicode_FromString ("ctypes"); > | if (!py_error_modname) { PyErr_PrintEx (0); goto out; } > | PyObject *py_error_mod = PyImport_Import (py_error_modname); > | @@ -84,7 +88,7 @@ > | Py_DECREF (py_error_mod); > | if (!py_error) { PyErr_PrintEx (0); goto out; } > | > | - py_args = Py_BuildValue ("(" "y#" "K" "I" "O" ")", subbuf, (int) count, offset, status, py_error); > | + py_args = Py_BuildValue ("(" "O" "K" "I" "O" ")", py_subbuf, offset, status, py_error); > | if (!py_args) { PyErr_PrintEx (0); goto out; } > | > | py_save = PyGILState_Ensure (); > | @@ -111,6 +115,11 @@ chunk_wrapper (void *user_data, const vo > | }; > | > | out: > | + if (py_subbuf) { > | + PyObject *tmp = PyObject_CallMethod(py_subbuf, "release", NULL);What it the user does: def f(b,o,s,e): use b... b.release() Or: def f(b,o,s,e): with b: use b... We would release a released memoryview here. I don't think this is likely to happen though.> | + Py_XDECREF (tmp); > | + Py_DECREF (py_subbuf); > | + } > | if (py_error) { > | PyObject *py_error_ret = PyObject_GetAttrString (py_error, "value"); > | *error = PyLong_AsLong (py_error_ret); > --- > generator/Python.ml | 20 ++++++++++++++------ > 1 file changed, 14 insertions(+), 6 deletions(-) > > diff --git a/generator/Python.ml b/generator/Python.ml > index 1c4446e..0191f79 100644 > --- a/generator/Python.ml > +++ b/generator/Python.ml > @@ -169,8 +169,7 @@ let > pr " PyObject *py_args, *py_ret;\n"; > List.iter ( > function > - | CBArrayAndLen (UInt32 n, _) -> > - pr " PyObject *py_%s = NULL;\n" n > + | CBArrayAndLen (UInt32 n, _) | CBBytesIn (n, _) > | CBMutable (Int n) -> > pr " PyObject *py_%s = NULL;\n" n > | _ -> () > @@ -187,7 +186,10 @@ let > pr " if (!py_e_%s) { PyErr_PrintEx (0); goto out; }\n" n; > pr " PyList_SET_ITEM (py_%s, i_%s, py_e_%s);\n" n n n; > pr " }\n" > - | CBBytesIn _ > + | CBBytesIn (n, len) -> > + pr " /* Cast away const */\n"; > + pr " py_%s = PyMemoryView_FromMemory ((char *) %s, %s, PyBUF_READ);\n" n n len; > + pr " if (!py_%s) { PyErr_PrintEx (0); goto out; }\n" n > | CBInt _ > | CBInt64 _ -> () > | CBMutable (Int n) -> > @@ -210,7 +212,7 @@ let > List.iter ( > function > | CBArrayAndLen (UInt32 n, len) -> pr " \"O\"" > - | CBBytesIn (n, len) -> pr " \"y#\"" > + | CBBytesIn (n, _) -> pr " \"O\"" > | CBInt n -> pr " \"i\"" > | CBInt64 n -> pr " \"L\"" > | CBMutable (Int n) -> pr " \"O\"" > @@ -223,7 +225,7 @@ let > List.iter ( > function > | CBArrayAndLen (UInt32 n, _) -> pr ", py_%s" n > - | CBBytesIn (n, len) -> pr ", %s, (int) %s" n len > + | CBBytesIn (n, _) -> pr ", py_%s" n > | CBMutable (Int n) -> pr ", py_%s" n > | CBInt n | CBInt64 n > | CBString n > @@ -268,7 +270,13 @@ let > pr " Py_DECREF (py_%s_ret);\n" n; > pr " Py_DECREF (py_%s);\n" n; > pr " }\n" > - | CBBytesIn _ > + | CBBytesIn (n, _) -> > + (* https://stackoverflow.com/questions/58870010/how-to-call-release-on-a-memoryview-in-python-c-api *) > + pr " if (py_%s) {\n" n; > + pr " PyObject *tmp = PyObject_CallMethod(py_%s, \"release\", NULL);\n" n; > + pr " Py_XDECREF (tmp);\n"; > + pr " Py_DECREF (py_%s);\n" n; > + pr " }\n" > | CBInt _ | CBInt64 _ > | CBString _ > | CBUInt _ | CBUInt64 _ -> () > -- > 2.36.1I'm not sure about this change - the performance improvement is great, but the API change is surprising. Since pread_structured() returns a buffer with the read data, can we arrange that the memoryview argument to the chunk callback is pointing into the returned buffer, so the view does not have to be released by the C extension? Nir