Initial commit: add .gitignore and README

2026-02-09 21:51:42 -08:00
commit c052b07662
3146 changed files with 808305 additions and 0 deletions
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/CPy.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/CPy.h
@@ -0,0 +1,975 @@
+// Mypyc C API
+
+#ifndef CPY_CPY_H
+#define CPY_CPY_H
+
+#include <stdbool.h>
+#include <Python.h>
+#include <frameobject.h>
+#include <structmember.h>
+#include <assert.h>
+#include <stdint.h>
+#include "pythonsupport.h"
+#include "mypyc_util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} // why isn't emacs smart enough to not indent this
+#endif
+
+#define CPYTHON_LARGE_INT_ERRMSG "Python int too large to convert to C ssize_t"
+
+
+// Naming conventions:
+//
+// Tagged: tagged int
+// Long: tagged long int (pointer)
+// Short: tagged short int (unboxed)
+// Ssize_t: A Py_ssize_t, which ought to be the same width as pointers
+// Object: CPython object (PyObject *)
+
+
+// Tuple type definitions needed for API functions
+
+
+// Unboxed 3-tuple of object pointers. In this header it is the value type
+// returned by CPy_CatchError() and CPy_GetExcInfo() and accepted by
+// CPy_RestoreExcInfo(). The MYPYC_DECLARED_ guard prevents a duplicate
+// typedef if the same tuple type has already been declared.
+#ifndef MYPYC_DECLARED_tuple_T3OOO
+#define MYPYC_DECLARED_tuple_T3OOO
+typedef struct tuple_T3OOO {
+    PyObject *f0;
+    PyObject *f1;
+    PyObject *f2;
+} tuple_T3OOO;
+#endif
+
+// Return type of the single-value dictionary iteration helpers
+// (CPyDict_NextKey and CPyDict_NextValue, declared later in this header).
+// Guarded so that the typedef is emitted only once.
+#ifndef MYPYC_DECLARED_tuple_T3CIO
+#define MYPYC_DECLARED_tuple_T3CIO
+typedef struct tuple_T3CIO {
+    char f0;  // Should continue?
+    CPyTagged f1;  // Last dict offset
+    PyObject *f2;  // Next dictionary key or value
+} tuple_T3CIO;
+#endif
+
+// Return type of CPyDict_NextItem (declared later in this header): the same
+// continue flag and offset as the single-value iteration tuple, but carrying
+// both the key and the value. Guarded so that the typedef is emitted only once.
+#ifndef MYPYC_DECLARED_tuple_T4CIOO
+#define MYPYC_DECLARED_tuple_T4CIOO
+typedef struct tuple_T4CIOO {
+    char f0;  // Should continue?
+    CPyTagged f1;  // Last dict offset
+    PyObject *f2;  // Next dictionary key
+    PyObject *f3;  // Next dictionary value
+} tuple_T4CIOO;
+#endif
+
+// Shared empty tuple object; only declared here, defined elsewhere.
+extern PyObject * __mypyc_empty_tuple__;
+
+// Return the shared empty tuple. Unless CPY_3_12_FEATURES is enabled, the
+// reference count is incremented before the pointer is handed out.
+static inline PyObject *CPyTuple_LoadEmptyTupleConstant(void) {
+    PyObject *empty = __mypyc_empty_tuple__;
+#if !CPY_3_12_FEATURES
+    Py_INCREF(empty);
+#endif
+    return empty;
+}
+
+// Native object operations
+
+
+// Locate the sub-vtable that implements `trait`. Trait entries occupy groups
+// of three slots stored *before* the start of `vtable`, so the scan begins at
+// index -3 and moves backwards three slots at a time. There is no terminator
+// and no bounds check: the caller must be certain that `trait` is present.
+static inline CPyVTableItem *CPy_FindTraitVtable(PyTypeObject *trait, CPyVTableItem *vtable) {
+    int slot = -3;
+    while ((PyTypeObject *)vtable[slot] != trait) {
+        slot -= 3;
+    }
+    // The slot after the trait type holds the trait's sub-vtable pointer.
+    return (CPyVTableItem *)vtable[slot + 1];
+}
+
+// Look up entry `index` of the offset table belonging to `trait`. The trait
+// entries are scanned backwards from vtable[-3] in steps of three, without any
+// bounds check, so `trait` must be known to be present.
+static inline size_t CPy_FindAttrOffset(PyTypeObject *trait, CPyVTableItem *vtable, size_t index) {
+    int slot = -3;
+    while ((PyTypeObject *)vtable[slot] != trait) {
+        slot -= 3;
+    }
+    // Two slots after the trait type is the pointer to the offset table.
+    size_t *offsets = (size_t *)vtable[slot + 2];
+    return offsets[index];
+}
+
+// Get attribute value using vtable (may return an undefined value)
+#define CPY_GET_ATTR(obj, type, vtable_index, object_type, attr_type)    \
+    ((attr_type (*)(object_type *))((object_type *)obj)->vtable[vtable_index])((object_type *)obj)
+
+#define CPY_GET_ATTR_TRAIT(obj, trait, vtable_index, object_type, attr_type)   \
+    ((attr_type (*)(object_type *))(CPy_FindTraitVtable(trait, ((object_type *)obj)->vtable))[vtable_index])((object_type *)obj)
+
+// Set attribute value using vtable
+#define CPY_SET_ATTR(obj, type, vtable_index, value, object_type, attr_type) \
+    ((bool (*)(object_type *, attr_type))((object_type *)obj)->vtable[vtable_index])( \
+        (object_type *)obj, value)
+
+#define CPY_SET_ATTR_TRAIT(obj, trait, vtable_index, value, object_type, attr_type) \
+    ((bool (*)(object_type *, attr_type))(CPy_FindTraitVtable(trait, ((object_type *)obj)->vtable))[vtable_index])( \
+        (object_type *)obj, value)
+
+#define CPY_GET_METHOD(obj, type, vtable_index, object_type, method_type) \
+    ((method_type)(((object_type *)obj)->vtable[vtable_index]))
+
+#define CPY_GET_METHOD_TRAIT(obj, trait, vtable_index, object_type, method_type) \
+    ((method_type)(CPy_FindTraitVtable(trait, ((object_type *)obj)->vtable)[vtable_index]))
+
+
+// Int operations
+
+
+// Conversions between tagged ints and C / CPython values, followed by the
+// out-of-line reference-count helpers that the inline CPyTagged_INCREF,
+// CPyTagged_DECREF and CPyTagged_XDECREF wrappers below call for values whose
+// tag bit is set. Declarations only; no definitions appear in this part of
+// the header.
+CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value);
+CPyTagged CPyTagged_FromVoidPtr(void *ptr);
+CPyTagged CPyTagged_FromInt64(int64_t value);
+PyObject *CPyTagged_AsObject(CPyTagged x);
+PyObject *CPyTagged_StealAsObject(CPyTagged x);
+Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x);
+void CPyTagged_IncRef(CPyTagged x);
+void CPyTagged_DecRef(CPyTagged x);
+void CPyTagged_XDecRef(CPyTagged x);
+
+// Out-of-line implementations of tagged-int operations. Names ending in an
+// underscore are called by the inline wrappers of the same name (without the
+// underscore) defined later in this header, when the wrapper's short-int fast
+// path does not apply. CPyTagged_BitwiseLongOp_ is shared by the inline
+// And/Or/Xor wrappers, which pass '&', '|' or '^' as `op`.
+bool CPyTagged_IsEq_(CPyTagged left, CPyTagged right);
+bool CPyTagged_IsLt_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_Negate_(CPyTagged num);
+CPyTagged CPyTagged_Invert_(CPyTagged num);
+CPyTagged CPyTagged_Add_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_Subtract_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_Multiply_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_FloorDivide_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_Remainder_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_BitwiseLongOp_(CPyTagged a, CPyTagged b, char op);
+CPyTagged CPyTagged_Rshift_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right);
+CPyTagged CPyTagged_BitLength(CPyTagged self);
+
+// String/float conversions and fixed-width integer helpers (declarations
+// only). The CPyLong_AsInt64_/AsInt32_/AsInt16_/AsUInt8_ functions are the
+// slow paths that the inline CPyLong_As* wrappers below fall back to.
+PyObject *CPyTagged_Str(CPyTagged n);
+CPyTagged CPyTagged_FromFloat(double f);
+PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base);
+PyObject *CPyLong_FromStr(PyObject *o);
+PyObject *CPyBool_Str(bool b);
+int64_t CPyLong_AsInt64_(PyObject *o);
+int64_t CPyInt64_Divide(int64_t x, int64_t y);
+int64_t CPyInt64_Remainder(int64_t x, int64_t y);
+int32_t CPyLong_AsInt32_(PyObject *o);
+int32_t CPyInt32_Divide(int32_t x, int32_t y);
+int32_t CPyInt32_Remainder(int32_t x, int32_t y);
+void CPyInt32_Overflow(void);
+int16_t CPyLong_AsInt16_(PyObject *o);
+int16_t CPyInt16_Divide(int16_t x, int16_t y);
+int16_t CPyInt16_Remainder(int16_t x, int16_t y);
+void CPyInt16_Overflow(void);
+uint8_t CPyLong_AsUInt8_(PyObject *o);
+void CPyUInt8_Overflow(void);
+double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y);
+
+// Nonzero when the CPY_INT_TAG bit of x is set, i.e. x is a "long" tagged
+// int (a tagged pointer) rather than an unboxed short int.
+static inline int CPyTagged_CheckLong(CPyTagged x) {
+    int tag_bit = x & CPY_INT_TAG;
+    return tag_bit;
+}
+
+// 1 when the CPY_INT_TAG bit of x is clear (x is an unboxed short int),
+// 0 otherwise.
+static inline int CPyTagged_CheckShort(CPyTagged x) {
+    return CPyTagged_CheckLong(x) == 0;
+}
+
+// Increment the reference count of x if it is a long (boxed) tagged int;
+// short ints need no reference counting and return immediately.
+static inline void CPyTagged_INCREF(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    CPyTagged_IncRef(x);
+}
+
+// Decrement the reference count of x if it is a long (boxed) tagged int;
+// short ints are left untouched.
+static inline void CPyTagged_DECREF(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    CPyTagged_DecRef(x);
+}
+
+// Like CPyTagged_DECREF, but forwards tagged values to CPyTagged_XDecRef.
+// Values without the tag bit are ignored.
+static inline void CPyTagged_XDECREF(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    CPyTagged_XDecRef(x);
+}
+
+// Strip the tag bit from a short tagged int and return its numeric value.
+// NOTE: relies on '>>' of a negative signed value being an arithmetic
+// (sign-extending) shift.
+static inline Py_ssize_t CPyTagged_ShortAsSsize_t(CPyTagged x) {
+    Py_ssize_t signed_bits = (Py_ssize_t)x;
+    return signed_bits >> 1;
+}
+
+// Recover the object pointer from a long tagged int by clearing the tag bit.
+// NOTE: the caller must already know that x is not a short int.
+static inline PyObject *CPyTagged_LongAsObject(CPyTagged x) {
+    CPyTagged untagged = x & ~CPY_INT_TAG;
+    return (PyObject *)untagged;
+}
+
+// Convert an int object to a tagged int without consuming the caller's
+// reference. If the value does not fit (overflow reported by
+// CPyLong_AsSsize_tAndOverflow), the result is the object pointer with the tag
+// bit set, and a new reference to the object is taken. Otherwise the result is
+// the value shifted left by one (a short int).
+static inline CPyTagged CPyTagged_FromObject(PyObject *object) {
+    int overflow;
+    // The overflow check knows about CPyTagged's width
+    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
+    if (unlikely(overflow != 0)) {
+        Py_INCREF(object);
+        return ((CPyTagged)object) | CPY_INT_TAG;
+    } else {
+        // Convert to the result type before shifting: left-shifting a
+        // negative signed value is undefined behaviour in C.
+        return (CPyTagged)value << 1;
+    }
+}
+
+// Convert an int object to a tagged int, consuming the caller's reference.
+// On overflow the reference is transferred into the returned tagged pointer;
+// otherwise the object is released and the value is returned as a short int.
+static inline CPyTagged CPyTagged_StealFromObject(PyObject *object) {
+    int overflow;
+    // The overflow check knows about CPyTagged's width
+    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
+    if (unlikely(overflow != 0)) {
+        return ((CPyTagged)object) | CPY_INT_TAG;
+    } else {
+        Py_DECREF(object);
+        // Convert to the result type before shifting: left-shifting a
+        // negative signed value is undefined behaviour in C.
+        return (CPyTagged)value << 1;
+    }
+}
+
+// Convert an int object to a tagged int without touching any reference
+// count. On overflow the returned tagged pointer borrows the caller's
+// reference to `object`.
+static inline CPyTagged CPyTagged_BorrowFromObject(PyObject *object) {
+    int overflow;
+    // The overflow check knows about CPyTagged's width
+    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
+    if (unlikely(overflow != 0)) {
+        return ((CPyTagged)object) | CPY_INT_TAG;
+    } else {
+        // Convert to the result type before shifting: left-shifting a
+        // negative signed value is undefined behaviour in C.
+        return (CPyTagged)value << 1;
+    }
+}
+
+// True when `value` lies outside [CPY_TAGGED_MIN, CPY_TAGGED_MAX].
+// The unsigned comparison comes first so that the common case (a small
+// non-negative value) is decided with a single test.
+static inline bool CPyTagged_TooBig(Py_ssize_t value) {
+    if ((size_t)value <= CPY_TAGGED_MAX) {
+        return false;
+    }
+    return value >= 0 || value < CPY_TAGGED_MIN;
+}
+
+// int64_t counterpart of CPyTagged_TooBig: true when `value` lies outside
+// [CPY_TAGGED_MIN, CPY_TAGGED_MAX]. The unsigned test handles the common
+// in-range non-negative case first.
+static inline bool CPyTagged_TooBigInt64(int64_t value) {
+    if ((uint64_t)value <= CPY_TAGGED_MAX) {
+        return false;
+    }
+    return value >= 0 || value < CPY_TAGGED_MIN;
+}
+
+// Addition overflowed iff the sign of `sum` differs from the sign of both
+// operands, which is what a negative XOR with each operand indicates.
+static inline bool CPyTagged_IsAddOverflow(CPyTagged sum, CPyTagged left, CPyTagged right) {
+    Py_ssize_t vs_left = (Py_ssize_t)(sum ^ left);
+    Py_ssize_t vs_right = (Py_ssize_t)(sum ^ right);
+    return vs_left < 0 && vs_right < 0;
+}
+
+// Subtraction overflowed iff the sign of `diff` differs from the sign of
+// `left` while matching the sign of `right`.
+static inline bool CPyTagged_IsSubtractOverflow(CPyTagged diff, CPyTagged left, CPyTagged right) {
+    Py_ssize_t vs_left = (Py_ssize_t)(diff ^ left);
+    Py_ssize_t vs_right = (Py_ssize_t)(diff ^ right);
+    return vs_left < 0 && vs_right >= 0;
+}
+
+// Conservative overflow pre-check for multiplication: reports "may overflow"
+// as soon as either operand, viewed as an unsigned tagged value, reaches
+// 2^(CPY_INT_BITS/2 - 1). Only products of two small non-negative operands
+// are cleared for the fast path.
+static inline bool CPyTagged_IsMultiplyOverflow(CPyTagged left, CPyTagged right) {
+    const unsigned int limit = 1U << (CPY_INT_BITS/2 - 1);
+    if (left >= limit) {
+        return true;
+    }
+    return right >= limit;
+}
+
+// True when a native division could fault: the divisor is zero, or the
+// dividend is the value whose only set bit is the top bit (the most negative
+// representable value).
+static inline bool CPyTagged_MaybeFloorDivideFault(CPyTagged left, CPyTagged right) {
+    if (right == 0) {
+        return true;
+    }
+    return left == -((size_t)1 << (CPY_INT_BITS-1));
+}
+
+static inline bool CPyTagged_MaybeRemainderFault(CPyTagged left, CPyTagged right) {
+    // The INT_MIN % -1 fault cannot occur here: the modulus is computed on
+    // still-tagged operands whose low bit is clear, so -1 is represented as
+    // -2. A zero divisor is therefore the only fault left to detect.
+    // `left` does not take part in the check.
+    bool divisor_is_zero = (right == 0);
+    return divisor_is_zero;
+}
+
+// left == right. When `left` is a short int a raw comparison is enough;
+// otherwise the out-of-line CPyTagged_IsEq_ decides.
+static inline bool CPyTagged_IsEq(CPyTagged left, CPyTagged right) {
+    return CPyTagged_CheckShort(left) ? left == right
+                                      : CPyTagged_IsEq_(left, right);
+}
+
+// left != right. When `left` is a short int a raw comparison is enough;
+// otherwise the result is the negation of CPyTagged_IsEq_.
+static inline bool CPyTagged_IsNe(CPyTagged left, CPyTagged right) {
+    return CPyTagged_CheckShort(left) ? left != right
+                                      : !CPyTagged_IsEq_(left, right);
+}
+
+// left < right. Two short ints are compared directly as signed values;
+// anything else goes through CPyTagged_IsLt_.
+static inline bool CPyTagged_IsLt(CPyTagged left, CPyTagged right) {
+    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
+    return both_short ? (Py_ssize_t)left < (Py_ssize_t)right
+                      : CPyTagged_IsLt_(left, right);
+}
+
+// left >= right. Two short ints are compared directly as signed values;
+// anything else is computed as !(left < right) via CPyTagged_IsLt_.
+static inline bool CPyTagged_IsGe(CPyTagged left, CPyTagged right) {
+    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
+    return both_short ? (Py_ssize_t)left >= (Py_ssize_t)right
+                      : !CPyTagged_IsLt_(left, right);
+}
+
+// left > right. Two short ints are compared directly as signed values;
+// anything else is computed as (right < left) via CPyTagged_IsLt_.
+static inline bool CPyTagged_IsGt(CPyTagged left, CPyTagged right) {
+    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
+    return both_short ? (Py_ssize_t)left > (Py_ssize_t)right
+                      : CPyTagged_IsLt_(right, left);
+}
+
+// left <= right. Two short ints are compared directly as signed values;
+// anything else is computed as !(right < left) via CPyTagged_IsLt_.
+static inline bool CPyTagged_IsLe(CPyTagged left, CPyTagged right) {
+    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
+    return both_short ? (Py_ssize_t)left <= (Py_ssize_t)right
+                      : !CPyTagged_IsLt_(right, left);
+}
+
+static inline int64_t CPyLong_AsInt64(PyObject *o) {
+    if (likely(PyLong_Check(o))) {
+        PyLongObject *lobj = (PyLongObject *)o;
+        Py_ssize_t size = Py_SIZE(lobj);
+        if (likely(size == 1)) {
+            // Fast path
+            return CPY_LONG_DIGIT(lobj, 0);
+        } else if (likely(size == 0)) {
+            return 0;
+        }
+    }
+    // Slow path
+    return CPyLong_AsInt64_(o);
+}
+
+// Convert a Python object to int32_t. An int made of a single non-negative
+// digit is returned directly and zero yields 0; every other input is handed
+// to the out-of-line CPyLong_AsInt32_.
+static inline int32_t CPyLong_AsInt32(PyObject *o) {
+    if (unlikely(!PyLong_Check(o))) {
+        return CPyLong_AsInt32_(o);
+    }
+    PyLongObject *lobj = (PyLongObject *)o;
+#if CPY_3_12_FEATURES
+    // Size and sign are encoded in the tag word.
+    size_t tag = CPY_LONG_TAG(lobj);
+    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
+        return CPY_LONG_DIGIT(lobj, 0);
+    }
+    if (likely(tag == CPY_SIGN_ZERO)) {
+        return 0;
+    }
+#else
+    Py_ssize_t size = lobj->ob_base.ob_size;
+    if (likely(size == 1)) {
+        return CPY_LONG_DIGIT(lobj, 0);
+    }
+    if (likely(size == 0)) {
+        return 0;
+    }
+#endif
+    // Negative or multi-digit value: take the slow path.
+    return CPyLong_AsInt32_(o);
+}
+
+// Convert a Python object to int16_t. A single non-negative digit below
+// 0x8000 is returned directly and zero yields 0; every other input (including
+// a single digit that is too large) is handed to CPyLong_AsInt16_.
+static inline int16_t CPyLong_AsInt16(PyObject *o) {
+    if (unlikely(!PyLong_Check(o))) {
+        return CPyLong_AsInt16_(o);
+    }
+    PyLongObject *lobj = (PyLongObject *)o;
+#if CPY_3_12_FEATURES
+    // Size and sign are encoded in the tag word.
+    size_t tag = CPY_LONG_TAG(lobj);
+    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
+        digit first = CPY_LONG_DIGIT(lobj, 0);
+        if (first < 0x8000)
+            return first;
+    } else if (likely(tag == CPY_SIGN_ZERO)) {
+        return 0;
+    }
+#else
+    Py_ssize_t size = lobj->ob_base.ob_size;
+    if (likely(size == 1)) {
+        digit first = lobj->ob_digit[0];
+        if (first < 0x8000)
+            return first;
+    } else if (likely(size == 0)) {
+        return 0;
+    }
+#endif
+    // Out of fast-path range: take the slow path.
+    return CPyLong_AsInt16_(o);
+}
+
+// Convert a Python object to uint8_t. A single non-negative digit below 256
+// is returned directly and zero yields 0; every other input (including a
+// single digit that is too large) is handed to CPyLong_AsUInt8_.
+static inline uint8_t CPyLong_AsUInt8(PyObject *o) {
+    if (unlikely(!PyLong_Check(o))) {
+        return CPyLong_AsUInt8_(o);
+    }
+    PyLongObject *lobj = (PyLongObject *)o;
+#if CPY_3_12_FEATURES
+    // Size and sign are encoded in the tag word.
+    size_t tag = CPY_LONG_TAG(lobj);
+    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
+        digit first = CPY_LONG_DIGIT(lobj, 0);
+        if (first < 256)
+            return first;
+    } else if (likely(tag == CPY_SIGN_ZERO)) {
+        return 0;
+    }
+#else
+    Py_ssize_t size = lobj->ob_base.ob_size;
+    if (likely(size == 1)) {
+        digit first = lobj->ob_digit[0];
+        if (first < 256)
+            return first;
+    } else if (likely(size == 0)) {
+        return 0;
+    }
+#endif
+    // Out of fast-path range: take the slow path.
+    return CPyLong_AsUInt8_(o);
+}
+
+// Unary minus. A short int is negated in place unless it is the value whose
+// only set bit is the top bit (the most negative value), which is the one
+// short int whose negation would overflow. That case and all long ints are
+// handled by CPyTagged_Negate_.
+static inline CPyTagged CPyTagged_Negate(CPyTagged num) {
+    // Build the top-bit mask in an unsigned type, as
+    // CPyTagged_MaybeFloorDivideFault does: shifting 1 into the sign bit of
+    // a signed type is undefined behaviour.
+    if (likely(CPyTagged_CheckShort(num)
+               && num != (CPyTagged) ((size_t)1 << (CPY_INT_BITS - 1)))) {
+        return -num;
+    }
+    return CPyTagged_Negate_(num);
+}
+
+// left + right. Two short ints are added directly; if either operand is
+// long, or the direct sum overflows, CPyTagged_Add_ computes the result.
+static inline CPyTagged CPyTagged_Add(CPyTagged left, CPyTagged right) {
+    // TODO: Use clang/gcc extension __builtin_saddll_overflow instead.
+    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
+        return CPyTagged_Add_(left, right);
+    }
+    CPyTagged sum = left + right;
+    if (unlikely(CPyTagged_IsAddOverflow(sum, left, right))) {
+        return CPyTagged_Add_(left, right);
+    }
+    return sum;
+}
+
+// left - right. Two short ints are subtracted directly; if either operand
+// is long, or the direct difference overflows, CPyTagged_Subtract_ computes
+// the result.
+static inline CPyTagged CPyTagged_Subtract(CPyTagged left, CPyTagged right) {
+    // TODO: Use a clang/gcc overflow-checking builtin instead.
+    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
+        return CPyTagged_Subtract_(left, right);
+    }
+    CPyTagged diff = left - right;
+    if (unlikely(CPyTagged_IsSubtractOverflow(diff, left, right))) {
+        return CPyTagged_Subtract_(left, right);
+    }
+    return diff;
+}
+
+// left * right. When both operands are short ints and the conservative
+// overflow pre-check passes, the still-tagged left operand is multiplied by
+// the untagged right operand, which yields a correctly tagged product.
+// Otherwise CPyTagged_Multiply_ computes the result.
+static inline CPyTagged CPyTagged_Multiply(CPyTagged left, CPyTagged right) {
+    // TODO: Consider using some clang/gcc extension to check for overflow
+    if (CPyTagged_CheckShort(left)
+        && CPyTagged_CheckShort(right)
+        && !CPyTagged_IsMultiplyOverflow(left, right)) {
+        return left * CPyTagged_ShortAsSsize_t(right);
+    }
+    return CPyTagged_Multiply_(left, right);
+}
+
+// left // right with rounding towards negative infinity. The fast path
+// applies when both operands are short ints and the division cannot fault;
+// everything else (long ints, zero divisor, most negative dividend) goes to
+// CPyTagged_FloorDivide_.
+static inline CPyTagged CPyTagged_FloorDivide(CPyTagged left, CPyTagged right) {
+    if (CPyTagged_CheckShort(left)
+        && CPyTagged_CheckShort(right)
+        && !CPyTagged_MaybeFloorDivideFault(left, right)) {
+        // C division truncates towards zero.
+        Py_ssize_t result = CPyTagged_ShortAsSsize_t(left) / CPyTagged_ShortAsSsize_t(right);
+        if (((Py_ssize_t)left < 0) != (((Py_ssize_t)right) < 0)) {
+            // Operands have different signs. If the division was inexact
+            // (untagged quotient times tagged divisor does not reproduce the
+            // tagged dividend), truncation rounded up, so step down by one.
+            if (result * right != left) {
+                // Round down
+                result--;
+            }
+        }
+        // Convert to the result type before re-tagging: left-shifting a
+        // negative signed value is undefined behaviour in C.
+        return (CPyTagged)result << 1;
+    }
+    return CPyTagged_FloorDivide_(left, right);
+}
+
+// left % right, where a non-zero result takes the sign of the divisor. The
+// fast path works directly on the still-tagged operands; long ints and a zero
+// divisor are handled by CPyTagged_Remainder_.
+static inline CPyTagged CPyTagged_Remainder(CPyTagged left, CPyTagged right) {
+    if (!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right))
+        || CPyTagged_MaybeRemainderFault(left, right)) {
+        return CPyTagged_Remainder_(left, right);
+    }
+    Py_ssize_t result = (Py_ssize_t)left % (Py_ssize_t)right;
+    bool signs_differ = ((Py_ssize_t)right < 0) != ((Py_ssize_t)left < 0);
+    if (signs_differ && result != 0) {
+        // C's % follows the sign of the dividend; shift into the divisor's range.
+        result += right;
+    }
+    return result;
+}
+
+// Bitwise '~'. For a short int other than CPY_TAGGED_ABS_MIN all bits are
+// flipped and the tag bit is cleared again; every other input goes to
+// CPyTagged_Invert_.
+static inline CPyTagged CPyTagged_Invert(CPyTagged num) {
+    if (unlikely(!(CPyTagged_CheckShort(num) && num != CPY_TAGGED_ABS_MIN))) {
+        return CPyTagged_Invert_(num);
+    }
+    return ~num & ~CPY_INT_TAG;
+}
+
+// Bitwise '&'. Two short ints are combined directly; otherwise the shared
+// long-int helper is called with op '&'.
+static inline CPyTagged CPyTagged_And(CPyTagged left, CPyTagged right) {
+    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
+        return CPyTagged_BitwiseLongOp_(left, right, '&');
+    }
+    return left & right;
+}
+
+// Bitwise '|'. Two short ints are combined directly; otherwise the shared
+// long-int helper is called with op '|'.
+static inline CPyTagged CPyTagged_Or(CPyTagged left, CPyTagged right) {
+    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
+        return CPyTagged_BitwiseLongOp_(left, right, '|');
+    }
+    return left | right;
+}
+
+// Bitwise '^'. Two short ints are combined directly; otherwise the shared
+// long-int helper is called with op '^'.
+static inline CPyTagged CPyTagged_Xor(CPyTagged left, CPyTagged right) {
+    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
+        return CPyTagged_BitwiseLongOp_(left, right, '^');
+    }
+    return left ^ right;
+}
+
+// Bitwise '>>'. The fast path needs two short ints and a non-negative shift
+// count; everything else goes to CPyTagged_Rshift_.
+static inline CPyTagged CPyTagged_Rshift(CPyTagged left, CPyTagged right) {
+    if (unlikely(!(CPyTagged_CheckShort(left)
+                   && CPyTagged_CheckShort(right)
+                   && (Py_ssize_t)right >= 0))) {
+        return CPyTagged_Rshift_(left, right);
+    }
+    CPyTagged count = CPyTagged_ShortAsSsize_t(right);
+    if (likely(count < CPY_INT_BITS)) {
+        // Arithmetic shift of the tagged value, then clear the tag bit that
+        // may have been shifted in from the payload.
+        return ((Py_ssize_t)left >> count) & ~CPY_INT_TAG;
+    }
+    // Shifting by the full width or more leaves only the sign: 0 or -1.
+    if ((Py_ssize_t)left >= 0) {
+        return 0;
+    } else {
+        return CPyTagged_ShortFromInt(-1);
+    }
+}
+
+// True when shifting `short_int` left by `shift` bits loses information,
+// i.e. shifting the result back right does not reproduce the input.
+// The left shift is carried out on the unsigned representation: overflowing
+// a signed left shift is undefined behaviour, and that is exactly the case
+// this function exists to detect. The right shift remains a signed shift.
+static inline bool IsShortLshiftOverflow(Py_ssize_t short_int, Py_ssize_t shift) {
+    Py_ssize_t shifted = (Py_ssize_t)((size_t)short_int << shift);
+    return (shifted >> shift) != short_int;
+}
+
+// Bitwise '<<'. The fast path needs two short ints, a non-negative shift
+// count below CPY_INT_BITS (the tagged count is compared against
+// CPY_INT_BITS * 2), and a shift that does not overflow. Everything else
+// goes to CPyTagged_Lshift_.
+static inline CPyTagged CPyTagged_Lshift(CPyTagged left, CPyTagged right) {
+    bool fast_candidate = CPyTagged_CheckShort(left)
+                          && CPyTagged_CheckShort(right)
+                          && (Py_ssize_t)right >= 0
+                          && right < CPY_INT_BITS * 2;
+    if (likely(fast_candidate)) {
+        CPyTagged shift = CPyTagged_ShortAsSsize_t(right);
+        if (!IsShortLshiftOverflow(left, shift)) {
+            // Short integers, no overflow
+            return left << shift;
+        }
+    }
+    return CPyTagged_Lshift_(left, right);
+}
+
+
+// Float operations
+
+
+// Float helper prototypes (declarations only). CPyFloat_Floor and
+// CPyFloat_Ceil return tagged ints; CPyFloat_FromTagged converts in the
+// other direction.
+double CPyFloat_FloorDivide(double x, double y);
+double CPyFloat_Pow(double x, double y);
+double CPyFloat_Sin(double x);
+double CPyFloat_Cos(double x);
+double CPyFloat_Tan(double x);
+double CPyFloat_Sqrt(double x);
+double CPyFloat_Exp(double x);
+double CPyFloat_Log(double x);
+CPyTagged CPyFloat_Floor(double x);
+CPyTagged CPyFloat_Ceil(double x);
+double CPyFloat_FromTagged(CPyTagged x);
+bool CPyFloat_IsInf(double x);
+bool CPyFloat_IsNaN(double x);
+
+
+// Generic operations (that work with arbitrary types)
+
+
+/* We use intentionally non-inlined decrefs in rarely executed code
+ * paths since it pretty substantially speeds up compile time. We have
+ * our own copies both to avoid the null check in Py_DecRef and to avoid
+ * making an indirect PIC call. */
+
+// Non-inlined wrapper around CPy_DECREF; p is passed to the macro as is.
+CPy_NOINLINE
+static void CPy_DecRef(PyObject *p) {
+    CPy_DECREF(p);
+}
+
+// Non-inlined wrapper around CPy_XDECREF.
+CPy_NOINLINE
+static void CPy_XDecRef(PyObject *p) {
+    CPy_XDECREF(p);
+}
+
+// Return the size of obj (as reported by PyObject_Size) as a tagged int.
+// A negative size from PyObject_Size is reported as CPY_INT_TAG.
+static inline CPyTagged CPyObject_Size(PyObject *obj) {
+    Py_ssize_t length = PyObject_Size(obj);
+    if (length >= 0) {
+        // Technically __len__ could return a really big number, so the
+        // conversion is allowed to produce a boxed int, even though that
+        // should not happen for a container that really holds its elements.
+        return CPyTagged_FromSsize_t(length);
+    }
+    return CPY_INT_TAG;
+}
+
+#ifdef MYPYC_LOG_GETATTR
+// Debug hook, compiled only when MYPYC_LOG_GETATTR is defined: imports the
+// module "getattr_hook" and calls its function named `method` with
+// (obj, attr). The result is discarded, and any error raised by the import
+// or the call is cleared.
+static void CPy_LogGetAttr(const char *method, PyObject *obj, PyObject *attr) {
+    PyObject *module = PyImport_ImportModule("getattr_hook");
+    if (module) {
+        // `method` is a C string, so call through PyObject_CallMethod, which
+        // takes the method name as const char *. PyObject_CallMethodObjArgs
+        // requires the name as a PyObject * and must not be given a C string.
+        PyObject *res = PyObject_CallMethod(module, method, "OO", obj, attr);
+        Py_XDECREF(res);
+        Py_DECREF(module);
+    }
+    PyErr_Clear();
+}
+#else
+// Logging disabled: the hook expands to a no-op expression.
+#define CPy_LogGetAttr(method, obj, attr) (void)0
+#endif
+
+// Intercept a method call and log it. This needs to be a macro
+// because there is no API that accepts va_args for making a
+// call. Worse, it needs to use the comma operator to return the right
+// value: the logging expression is evaluated first and discarded, and the
+// value of the whole expression is the result of PyObject_CallMethodObjArgs.
+// `obj` and `attr` appear in both sub-expressions.
+#define CPyObject_CallMethodObjArgs(obj, attr, ...)             \
+    (CPy_LogGetAttr("log_method", (obj), (attr)),               \
+     PyObject_CallMethodObjArgs((obj), (attr), __VA_ARGS__))
+
+// This one is a macro for consistency with the above, I guess.
+// Logs under the name "log", then evaluates to PyObject_GetAttr(obj, attr).
+#define CPyObject_GetAttr(obj, attr)                       \
+    (CPy_LogGetAttr("log", (obj), (attr)),                 \
+     PyObject_GetAttr((obj), (attr)))
+
+// Remaining generic object helpers (declarations only).
+CPyTagged CPyObject_Hash(PyObject *o);
+PyObject *CPyObject_GetAttr3(PyObject *v, PyObject *name, PyObject *defl);
+PyObject *CPyIter_Next(PyObject *iter);
+PyObject *CPyNumber_Power(PyObject *base, PyObject *index);
+PyObject *CPyNumber_InPlacePower(PyObject *base, PyObject *index);
+PyObject *CPyObject_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
+
+
+// List operations
+
+
+// List and generic sequence helpers (declarations only). Index parameters
+// are tagged ints unless the signature says int64_t or Py_ssize_t.
+PyObject *CPyList_Build(Py_ssize_t len, ...);
+PyObject *CPyList_GetItem(PyObject *list, CPyTagged index);
+PyObject *CPyList_GetItemShort(PyObject *list, CPyTagged index);
+PyObject *CPyList_GetItemBorrow(PyObject *list, CPyTagged index);
+PyObject *CPyList_GetItemShortBorrow(PyObject *list, CPyTagged index);
+PyObject *CPyList_GetItemInt64(PyObject *list, int64_t index);
+PyObject *CPyList_GetItemInt64Borrow(PyObject *list, int64_t index);
+bool CPyList_SetItem(PyObject *list, CPyTagged index, PyObject *value);
+void CPyList_SetItemUnsafe(PyObject *list, Py_ssize_t index, PyObject *value);
+bool CPyList_SetItemInt64(PyObject *list, int64_t index, PyObject *value);
+PyObject *CPyList_PopLast(PyObject *obj);
+PyObject *CPyList_Pop(PyObject *obj, CPyTagged index);
+CPyTagged CPyList_Count(PyObject *obj, PyObject *value);
+int CPyList_Insert(PyObject *list, CPyTagged index, PyObject *value);
+PyObject *CPyList_Extend(PyObject *o1, PyObject *o2);
+int CPyList_Remove(PyObject *list, PyObject *obj);
+CPyTagged CPyList_Index(PyObject *list, PyObject *obj);
+PyObject *CPySequence_Sort(PyObject *seq);
+PyObject *CPySequence_Multiply(PyObject *seq, CPyTagged t_size);
+PyObject *CPySequence_RMultiply(CPyTagged t_size, PyObject *seq);
+PyObject *CPySequence_InPlaceMultiply(PyObject *seq, CPyTagged t_size);
+PyObject *CPyList_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
+char CPyList_Clear(PyObject *list);
+PyObject *CPyList_Copy(PyObject *list);
+int CPySequence_Check(PyObject *obj);
+
+
+// Dict operations
+
+
+// Dict and mapping helpers (declarations only). The CPyDict_Next* functions
+// return the iteration tuples declared at the top of this header: a continue
+// flag, the last dict offset, and the next key and/or value.
+PyObject *CPyDict_GetItem(PyObject *dict, PyObject *key);
+int CPyDict_SetItem(PyObject *dict, PyObject *key, PyObject *value);
+PyObject *CPyDict_Get(PyObject *dict, PyObject *key, PyObject *fallback);
+PyObject *CPyDict_GetWithNone(PyObject *dict, PyObject *key);
+PyObject *CPyDict_SetDefault(PyObject *dict, PyObject *key, PyObject *value);
+PyObject *CPyDict_SetDefaultWithNone(PyObject *dict, PyObject *key);
+PyObject *CPyDict_SetDefaultWithEmptyDatatype(PyObject *dict, PyObject *key, int data_type);
+PyObject *CPyDict_Build(Py_ssize_t size, ...);
+int CPyDict_Update(PyObject *dict, PyObject *stuff);
+int CPyDict_UpdateInDisplay(PyObject *dict, PyObject *stuff);
+int CPyDict_UpdateFromAny(PyObject *dict, PyObject *stuff);
+PyObject *CPyDict_FromAny(PyObject *obj);
+PyObject *CPyDict_KeysView(PyObject *dict);
+PyObject *CPyDict_ValuesView(PyObject *dict);
+PyObject *CPyDict_ItemsView(PyObject *dict);
+PyObject *CPyDict_Keys(PyObject *dict);
+PyObject *CPyDict_Values(PyObject *dict);
+PyObject *CPyDict_Items(PyObject *dict);
+char CPyDict_Clear(PyObject *dict);
+PyObject *CPyDict_Copy(PyObject *dict);
+PyObject *CPyDict_GetKeysIter(PyObject *dict);
+PyObject *CPyDict_GetItemsIter(PyObject *dict);
+PyObject *CPyDict_GetValuesIter(PyObject *dict);
+tuple_T3CIO CPyDict_NextKey(PyObject *dict_or_iter, CPyTagged offset);
+tuple_T3CIO CPyDict_NextValue(PyObject *dict_or_iter, CPyTagged offset);
+tuple_T4CIOO CPyDict_NextItem(PyObject *dict_or_iter, CPyTagged offset);
+int CPyMapping_Check(PyObject *obj);
+
+// Verify that an exact dict still has `size` entries, i.e. that it did not
+// change size during iteration. Returns 1 when the check passes, or 0 after
+// setting a RuntimeError. Dict subclasses always pass; they are left to the
+// Python runtime to check.
+static inline char CPyDict_CheckSize(PyObject *dict, Py_ssize_t size) {
+    if (PyDict_CheckExact(dict) && PyDict_Size(dict) != size) {
+        PyErr_SetString(PyExc_RuntimeError, "dictionary changed size during iteration");
+        return 0;
+    }
+    return 1;
+}
+
+
+// Str operations
+
+// Macros for strip type. These values are copied from CPython. They are the
+// `strip_type` argument of _CPyStr_Strip below.
+#define LEFTSTRIP  0
+#define RIGHTSTRIP 1
+#define BOTHSTRIP  2
+
+// String helpers (declarations only).
+char CPyStr_Equal(PyObject *str1, PyObject *str2);
+char CPyStr_EqualLiteral(PyObject *str, PyObject *literal_str, Py_ssize_t literal_length);
+PyObject *CPyStr_Build(Py_ssize_t len, ...);
+PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index);
+PyObject *CPyStr_GetItemUnsafe(PyObject *str, Py_ssize_t index);
+CPyTagged CPyStr_Find(PyObject *str, PyObject *substr, CPyTagged start, int direction);
+CPyTagged CPyStr_FindWithEnd(PyObject *str, PyObject *substr, CPyTagged start, CPyTagged end, int direction);
+PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split);
+PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split);
+// Shared implementation behind the CPyStr_Strip/LStrip/RStrip wrappers.
+PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep);
+// Thin wrappers that select the strip mode passed to _CPyStr_Strip.
+
+// Strip on both sides.
+static inline PyObject *CPyStr_Strip(PyObject *self, PyObject *sep) {
+    PyObject *stripped = _CPyStr_Strip(self, BOTHSTRIP, sep);
+    return stripped;
+}
+
+// Strip on the left side only.
+static inline PyObject *CPyStr_LStrip(PyObject *self, PyObject *sep) {
+    PyObject *stripped = _CPyStr_Strip(self, LEFTSTRIP, sep);
+    return stripped;
+}
+
+// Strip on the right side only.
+static inline PyObject *CPyStr_RStrip(PyObject *self, PyObject *sep) {
+    PyObject *stripped = _CPyStr_Strip(self, RIGHTSTRIP, sep);
+    return stripped;
+}
+// Further string, decoding and encoding helpers (declarations only).
+PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr, PyObject *new_substr, CPyTagged max_replace);
+PyObject *CPyStr_Append(PyObject *o1, PyObject *o2);
+PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
+int CPyStr_Startswith(PyObject *self, PyObject *subobj);
+int CPyStr_Endswith(PyObject *self, PyObject *subobj);
+PyObject *CPyStr_Removeprefix(PyObject *self, PyObject *prefix);
+PyObject *CPyStr_Removesuffix(PyObject *self, PyObject *suffix);
+bool CPyStr_IsTrue(PyObject *obj);
+// NOTE: despite the "size_t" in its name, this returns Py_ssize_t.
+Py_ssize_t CPyStr_Size_size_t(PyObject *str);
+PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors);
+PyObject *CPy_DecodeUTF8(PyObject *bytes);
+PyObject *CPy_DecodeASCII(PyObject *bytes);
+PyObject *CPy_DecodeLatin1(PyObject *bytes);
+PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors);
+Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start);
+Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
+CPyTagged CPyStr_Ord(PyObject *obj);
+
+
+// Bytes operations
+
+
+// Bytes helpers (declarations only).
+PyObject *CPyBytes_Build(Py_ssize_t len, ...);
+PyObject *CPyBytes_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
+CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index);
+PyObject *CPyBytes_Concat(PyObject *a, PyObject *b);
+PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter);
+CPyTagged CPyBytes_Ord(PyObject *obj);
+
+
+int CPyBytes_Compare(PyObject *left, PyObject *right);
+
+
+// Set operations
+
+
+bool CPySet_Remove(PyObject *set, PyObject *key);
+
+
+// Tuple operations
+
+
+// Tuple helpers (declarations only). The *Unsafe variants take a raw
+// Py_ssize_t index instead of a tagged int.
+PyObject *CPySequenceTuple_GetItem(PyObject *tuple, CPyTagged index);
+PyObject *CPySequenceTuple_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
+PyObject *CPySequenceTuple_GetItemUnsafe(PyObject *tuple, Py_ssize_t index);
+void CPySequenceTuple_SetItemUnsafe(PyObject *tuple, Py_ssize_t index, PyObject *value);
+
+
+// Exception operations
+
+
+// mypyc is not very good at dealing with refcount management of
+// pointers that might be NULL. As a workaround for this, the
+// exception APIs that might want to return NULL pointers instead
+// return properly refcounted pointers to this dummy object.
+// _CPy_ToDummy and _CPy_FromDummy below convert between NULL and the dummy.
+// The struct holds nothing but the standard object header; the object and
+// the pointer to it are only declared here.
+struct ExcDummyStruct { PyObject_HEAD };
+extern struct ExcDummyStruct _CPy_ExcDummyStruct;
+extern PyObject *_CPy_ExcDummy;
+
+// If *p is NULL, replace it with a new reference to the dummy object.
+// A non-NULL *p is left untouched.
+static inline void _CPy_ToDummy(PyObject **p) {
+    if (*p != NULL) {
+        return;
+    }
+    Py_INCREF(_CPy_ExcDummy);
+    *p = _CPy_ExcDummy;
+}
+
+// Inverse of _CPy_ToDummy: the dummy object maps back to NULL (no reference
+// is taken); any other pointer is returned with a new reference.
+static inline PyObject *_CPy_FromDummy(PyObject *p) {
+    if (p != _CPy_ExcDummy) {
+        Py_INCREF(p);
+        return p;
+    }
+    return NULL;
+}
+
+static int CPy_NoErrOccurred(void) {
+    return PyErr_Occurred() == NULL;
+}
+
+// Always returns false.
+static inline bool CPy_KeepPropagating(void) {
+    return false;
+}
+// We want to avoid the public PyErr_GetExcInfo API for these because
+// it requires a bunch of spurious refcount traffic on the parts of
+// the triple we don't care about.
+// Expands to the exc_info field of the current thread state.
+#define CPy_ExcState() PyThreadState_GET()->exc_info
+
+// Exception helpers (declarations only). CPy_CatchError and CPy_GetExcInfo
+// return the exception triple as a tuple_T3OOO; CPy_RestoreExcInfo takes one.
+void CPy_Raise(PyObject *exc);
+void CPy_Reraise(void);
+void CPyErr_SetObjectAndTraceback(PyObject *type, PyObject *value, PyObject *traceback);
+tuple_T3OOO CPy_CatchError(void);
+void CPy_RestoreExcInfo(tuple_T3OOO info);
+bool CPy_ExceptionMatches(PyObject *type);
+PyObject *CPy_GetExcValue(void);
+tuple_T3OOO CPy_GetExcInfo(void);
+void _CPy_GetExcInfo(PyObject **p_type, PyObject **p_value, PyObject **p_traceback);
+void CPyError_OutOfMemory(void);
+void CPy_TypeError(const char *expected, PyObject *value);
+void CPy_AddTraceback(const char *filename, const char *funcname, int line, PyObject *globals);
+void CPy_TypeErrorTraceback(const char *filename, const char *funcname, int line,
+                            PyObject *globals, const char *expected, PyObject *value);
+void CPy_AttributeError(const char *filename, const char *funcname, const char *classname,
+                        const char *attrname, int line, PyObject *globals);
+
+
+// Misc operations
+
+// Aliases for CPython's trashcan macros. CPy_TRASHCAN_END accepts `op` for
+// symmetry with CPy_TRASHCAN_BEGIN but does not use it.
+#define CPy_TRASHCAN_BEGIN(op, dealloc) Py_TRASHCAN_BEGIN(op, dealloc)
+#define CPy_TRASHCAN_END(op) Py_TRASHCAN_END
+
+// Tweaked version of _PyArg_Parser in CPython.
+// Describes how the arguments of one function are parsed. Fields without a
+// comment below are not described further in this header.
+// NOTE(review): the name `has_required_kws` and its comment ("any
+// keyword-only arguments?") do not say quite the same thing; confirm against
+// the parser implementation which one is meant.
+typedef struct CPyArg_Parser {
+    const char *format;
+    const char * const *keywords;
+    const char *fname;
+    const char *custom_msg;
+    int pos;               /* number of positional-only arguments */
+    int min;               /* minimal number of arguments */
+    int max;               /* maximal number of positional arguments */
+    int has_required_kws;  /* are there any keyword-only arguments? */
+    int required_kwonly_start;
+    int varargs;           /* does the function accept *args or **kwargs? */
+    PyObject *kwtuple;     /* tuple of keyword parameter names */
+    struct CPyArg_Parser *next;
+} CPyArg_Parser;
+
+// Because mypy allows an int wherever a float is expected, a value that is
+// a "float" at runtime may really be a Python int. Accept either: the
+// result is true when PyFloat_Check(o) or PyLong_Check(o) holds.
+static inline bool CPyFloat_Check(PyObject *o) {
+    if (PyFloat_Check(o)) {
+        return true;
+    }
+    return PyLong_Check(o);
+}
+
+// TODO: find a unified way to avoid inline functions in non-C back ends that
+//       cannot use inline functions
+//
+// Returns the result of PyObject_TypeCheck(o, type) as a bool; `type` is
+// passed as a PyObject * and cast to PyTypeObject * here.
+static inline bool CPy_TypeCheck(PyObject *o, PyObject *type) {
+    PyTypeObject *wanted = (PyTypeObject *)type;
+    if (PyObject_TypeCheck(o, wanted)) {
+        return true;
+    }
+    return false;
+}
+
+// Returns Py_TYPE(obj) cast to PyObject *, after incrementing its reference
+// count with Py_INCREF.
+static inline PyObject *CPy_TYPE(PyObject *obj) {
+    PyObject *type_obj = (PyObject *)Py_TYPE(obj);
+
+    Py_INCREF(type_obj);
+    return type_obj;
+}
+
+// Miscellaneous runtime entry points. Only prototypes appear in this section;
+// none of the function bodies are visible here.
+PyObject *CPy_CalculateMetaclass(PyObject *type, PyObject *o);
+PyObject *CPy_GetCoro(PyObject *obj);
+PyObject *CPyIter_Send(PyObject *iter, PyObject *val);
+int CPy_YieldFromErrorHandle(PyObject *iter, PyObject **outp);
+PyObject *CPy_FetchStopIterationValue(void);
+// CPyType_FromTemplate and CPyType_FromTemplateWrapper take identical
+// parameter lists. The first parameter is spelled `template_` with a trailing
+// underscore; this header is wrapped in extern "C" for C++ builds, where
+// `template` is a reserved word.
+PyObject *CPyType_FromTemplate(PyObject *template_,
+                               PyObject *orig_bases,
+                               PyObject *modname);
+PyObject *CPyType_FromTemplateWrapper(PyObject *template_,
+                                      PyObject *orig_bases,
+                                      PyObject *modname);
+int CPyDataclass_SleightOfHand(PyObject *dataclass_dec, PyObject *tp,
+                               PyObject *dict, PyObject *annotations,
+                               PyObject *dataclass_type);
+PyObject *CPyPickle_SetState(PyObject *obj, PyObject *state);
+PyObject *CPyPickle_GetState(PyObject *obj);
+CPyTagged CPyTagged_Id(PyObject *o);
+void CPyDebug_Print(const char *msg);
+void CPyDebug_PrintObject(PyObject *obj);
+void CPy_Init(void);
+// Argument parsing. All of these are variadic. The four
+// CPyArg_ParseStackAndKeywords* functions share one signature: an
+// (args, nargs, kwnames) triple, a CPyArg_Parser pointer, then the variadic
+// arguments.
+int CPyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
+                                 const char *, const char *, const char * const *, ...);
+int CPyArg_ParseStackAndKeywords(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                 CPyArg_Parser *parser, ...);
+int CPyArg_ParseStackAndKeywordsNoArgs(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                       CPyArg_Parser *parser, ...);
+int CPyArg_ParseStackAndKeywordsOneArg(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                       CPyArg_Parser *parser, ...);
+int CPyArg_ParseStackAndKeywordsSimple(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                       CPyArg_Parser *parser, ...);
+
+int CPySequence_CheckUnpackCount(PyObject *sequence, Py_ssize_t expected);
+// Takes the `statics` array plus seven const tables, one per parameter name:
+// strings, bytestrings, ints, floats, complex_numbers, tuples, frozensets.
+// Note that `ints` is a table of C strings, not of integers.
+int CPyStatics_Initialize(PyObject **statics,
+                          const char * const *strings,
+                          const char * const *bytestrings,
+                          const char * const *ints,
+                          const double *floats,
+                          const double *complex_numbers,
+                          const int *tuples,
+                          const int *frozensets);
+PyObject *CPy_Super(PyObject *builtins, PyObject *self);
+PyObject *CPy_CallReverseOpMethod(PyObject *left, PyObject *right, const char *op,
+                                  _Py_Identifier *method);
+
+// Import helpers. CPyImport_ImportMany is the only one returning bool; the
+// tb_* parameters are presumably used for traceback reporting -- TODO confirm.
+bool CPyImport_ImportMany(PyObject *modules, CPyModule **statics[], PyObject *globals,
+                          PyObject *tb_path, PyObject *tb_function, Py_ssize_t *tb_lines);
+PyObject *CPyImport_ImportFromMany(PyObject *mod_id, PyObject *names, PyObject *as_names,
+                                   PyObject *globals);
+
+PyObject *CPySingledispatch_RegisterFunction(PyObject *singledispatch_func, PyObject *cls,
+                                             PyObject *func);
+
+PyObject *CPy_GetAIter(PyObject *obj);
+PyObject *CPy_GetANext(PyObject *aiter);
+void CPy_SetTypeAliasTypeComputeFunction(PyObject *alias, PyObject *compute_value);
+void CPyTrace_LogEvent(const char *location, const char *line, const char *op, const char *details);
+
+// Attribute lookup that forwards to CPython's private
+// _PyObject_GenericGetAttrWithDict, passing NULL for the dict and 1 as the
+// final argument.
+// NOTE(review): the final argument is presumably CPython's "suppress" flag;
+// confirm how a missing attribute is reported to the caller.
+static inline PyObject *CPyObject_GenericGetAttr(PyObject *self, PyObject *name) {
+    PyObject *found = _PyObject_GenericGetAttrWithDict(self, name, NULL, 1);
+
+    return found;
+}
+// Attribute assignment that forwards to CPython's private
+// _PyObject_GenericSetAttrWithDict, passing NULL for the dict, and returns
+// that function's int status unchanged.
+static inline int CPyObject_GenericSetAttr(PyObject *self, PyObject *name, PyObject *value) {
+    int status = _PyObject_GenericSetAttrWithDict(self, name, value, NULL);
+
+    return status;
+}
+
+// Prototype only; the function body is not visible in this section.
+PyObject *CPy_SetupObject(PyObject *type);
+
+// Declared only when CPY_3_11_FEATURES evaluates to true.
+#if CPY_3_11_FEATURES
+PyObject *CPy_GetName(PyObject *obj);
+#endif
+
+// Declared only when CPY_3_14_FEATURES evaluates to true.
+#if CPY_3_14_FEATURES
+void CPy_SetImmortal(PyObject *obj);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // CPY_CPY_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx/codec.c
@@ -0,0 +1,68 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX
+#include <immintrin.h>
+
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+// If BASE64_AVX_USE_ASM is already defined when this point is reached, the
+// #ifndef leaves that value untouched, so the detection can be overridden.
+#ifndef BASE64_AVX_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+#  define BASE64_AVX_USE_ASM 1
+# else
+#  define BASE64_AVX_USE_ASM 0
+# endif
+#endif
+
+// The decoder sources come from the SSSE3 directory and are textually
+// included, so they are compiled as part of this translation unit.
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+
+// Encoder: either the AVX inline-assembly loop from this directory, or the
+// three SSSE3 C sources.
+#if BASE64_AVX_USE_ASM
+# include "./enc_loop_asm.c"
+#else
+# include "../ssse3/enc_translate.c"
+# include "../ssse3/enc_reshuffle.c"
+# include "../ssse3/enc_loop.c"
+#endif
+
+#endif	// HAVE_AVX
+
+// Streaming encoder entry point for the AVX codec. The parameter list comes
+// from the BASE64_ENC_PARAMS macro (defined elsewhere); the fallback branch
+// below shows that it provides state, src, srclen, out and outlen.
+void
+base64_stream_encode_avx BASE64_ENC_PARAMS
+{
+#if HAVE_AVX
+	// NOTE(review): enc_head.c is textually included into the function
+	// body and presumably declares the locals s, slen, o and olen that
+	// are used below -- confirm against that file.
+	#include "../generic/enc_head.c"
+
+	// For supported compilers, use a hand-optimized inline assembly
+	// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
+	// AVX flags to generate better optimized AVX code.
+
+#if BASE64_AVX_USE_ASM
+	enc_loop_avx(&s, &slen, &o, &olen);
+#else
+	enc_loop_ssse3(&s, &slen, &o, &olen);
+#endif
+
+	#include "../generic/enc_tail.c"
+#else
+	// Built without AVX support: hand all arguments to the shared stub.
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+// Streaming decoder entry point for the AVX codec. The parameter list comes
+// from the BASE64_DEC_PARAMS macro (defined elsewhere); the fallback branch
+// below shows that it provides state, src, srclen, out and outlen.
+int
+base64_stream_decode_avx BASE64_DEC_PARAMS
+{
+#if HAVE_AVX
+	// The bulk loop is the SSSE3 one; there is no AVX-specific decoder.
+	// NOTE(review): no return statement is visible in this branch, so the
+	// return value presumably comes from the included dec_tail.c -- confirm.
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	// Built without AVX support: return whatever the shared stub returns.
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx/enc_loop_asm.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx/enc_loop_asm.c
@@ -0,0 +1,264 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code was
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads register R0 from memory. The
+// offset at which the register is loaded is set by the given round.
+// R0 must be a string literal naming an asm operand (e.g. "a"). ROUND is
+// stringized into the instruction text, so the displacement ROUND * 12 is
+// computed by the assembler; 12 is the number of input bytes per round.
+#define LOAD(R0, ROUND) \
+	"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
+
+// Generate a block of inline assembly that deinterleaves and shuffles register
+// R0 using preloaded constants. R0 is only read; the result is left in R1,
+// and R2 is overwritten as a temporary. (Operands are in AT&T order, so the
+// last operand of each instruction is its destination.)
+#define SHUF(R0, R1, R2) \
+	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
+	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
+	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
+	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
+	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
+	"vpor     %["R1"], %["R2"], %["R1"] \n\t"
+
+// Generate a block of inline assembly that takes R0 and R1 and translates
+// their contents to the base64 alphabet, using preloaded constants.
+// As written, R1 is the input (it is only read), the translated result is
+// left in R0, and R2 is overwritten as a temporary. Called with the same
+// register arguments as SHUF, this puts the final result back in SHUF's R0.
+#define TRAN(R0, R1, R2) \
+	"vpsubusb %[n51],  %["R1"], %["R0"] \n\t" \
+	"vpcmpgtb %[n25],  %["R1"], %["R2"] \n\t" \
+	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
+	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
+	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"
+
+// Generate a block of inline assembly that stores the given register R0 at an
+// offset set by the given round.
+// ROUND is stringized into the instruction text, so the displacement
+// ROUND * 16 is computed by the assembler; 16 is the number of output bytes
+// per round.
+#define STOR(R0, ROUND) \
+	"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
+
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round: fetch the data, process it, and store the result. Then update
+// the source and destination pointers.
+// Uses operand a for the data and operands b and c as scratch. The source
+// pointer advances by 12 bytes and the destination pointer by 16 bytes.
+#define ROUND() \
+	LOAD("a", 0) \
+	SHUF("a", "b", "c") \
+	TRAN("a", "b", "c") \
+	STOR("a", 0) \
+	"add $12, %[src] \n\t" \
+	"add $16, %[dst] \n\t"
+
+// Define a macro that initiates a three-way interleaved encoding round by
+// preloading registers a, b and c from memory.
+// On exit, a holds round 0 already shuffled and translated (ready to be
+// stored), while b and c hold the raw input of rounds 1 and 2. Registers d
+// and e are overwritten as scratch. Nothing is stored and the pointers are
+// not advanced.
+// The register graph shows which registers are in use during each step, and
+// is a visual aid for choosing registers for that step. Symbol index:
+//
+//  +  indicates that a register is loaded by that step.
+//  |  indicates that a register is in use and must not be touched.
+//  -  indicates that a register is decommissioned by that step.
+//  x  indicates that a register is used as a temporary by that step.
+//  V  indicates that a register is an input or output to the macro.
+//
+#define ROUND_3_INIT() 			/*  a b c d e f  */ \
+	LOAD("a",   0)			/*  +            */ \
+	SHUF("a", "d", "e")		/*  |     + x    */ \
+	LOAD("b",   1)			/*  | +   |      */ \
+	TRAN("a", "d", "e")		/*  | |   - x    */ \
+	LOAD("c",   2)			/*  V V V        */
+
+// Define a macro that translates, shuffles and stores the input registers A, B
+// and C, and preloads registers D, E and F for the next round.
+// This macro can be arbitrarily daisy-chained by feeding output registers D, E
+// and F back into the next round as input registers A, B and C. The macro
+// carefully interleaves memory operations with data operations for optimal
+// pipelined performance.
+//
+// Register contract, as implemented by the steps below: on entry A holds an
+// encoded block that is ready to store, and B and C hold raw input. On exit D
+// holds an encoded block and E and F hold raw input. Blocks ROUND+0 through
+// ROUND+2 are stored and blocks ROUND+3 through ROUND+5 are loaded; the
+// pointers themselves are not advanced.
+
+#define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
+	LOAD(D, (ROUND + 3))		/*  V V V +      */ \
+	SHUF(B, E, F)			/*  | | | | + x  */ \
+	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
+	TRAN(B, E, F)			/*    | | | - x  */ \
+	LOAD(E, (ROUND + 4))		/*    | | | +    */ \
+	SHUF(C, A, F)			/*  + | | | | x  */ \
+	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
+	TRAN(C, A, F)			/*  -   | | | x  */ \
+	LOAD(F, (ROUND + 5))		/*      | | | +  */ \
+	SHUF(D, A, B)			/*  + x | | | |  */ \
+	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
+	TRAN(D, A, B)			/*  - x   V V V  */
+
+// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
+// registers D, E and F, and translating, shuffling and storing them.
+// No loads are performed. D is stored as-is at ROUND+3 (it arrives already
+// encoded); E and F are shuffled and translated before being stored at
+// ROUND+4 and ROUND+5. Registers A, B, C and D are reused as scratch.
+#define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
+	SHUF(E, A, B)			/*  + x   V V V  */ \
+	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
+	TRAN(E, A, B)			/*  - x     | |  */ \
+	SHUF(F, C, D)			/*      + x | |  */ \
+	STOR(E, (ROUND + 4))		/*      |   - |  */ \
+	TRAN(F, C, D)			/*      - x   |  */ \
+	STOR(F, (ROUND + 5))		/*            -  */
+
+// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
+#define ROUND_3_A(ROUND) \
+	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Define a type B round. Inputs and outputs are swapped with regard to type A.
+// Because each type's outputs are the other type's inputs, A and B rounds
+// are chained alternately.
+#define ROUND_3_B(ROUND) \
+	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Terminating macro for a type A round.
+// The same ROUND value is passed to both halves, so one invocation stores
+// six blocks: ROUND+0..ROUND+2 from ROUND_3 and ROUND+3..ROUND+5 from
+// ROUND_3_END.
+#define ROUND_3_A_LAST(ROUND) \
+	ROUND_3_A(ROUND) \
+	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Terminating macro for a type B round. Stores six blocks, like
+// ROUND_3_A_LAST, with the register roles swapped.
+#define ROUND_3_B_LAST(ROUND) \
+	ROUND_3_B(ROUND) \
+	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
+// Bulk AVX encoder loop, written as one inline-assembly statement.
+//
+//   s, o  - pointers to the caller's input and output pointers. *s and *o
+//           are bound to the asm as read-write operands, so they are
+//           advanced past the consumed input and the produced output.
+//   slen  - remaining input length; reduced by 12 bytes per round.
+//   olen  - output length so far; increased by 16 bytes per round.
+//
+// Returns immediately, touching nothing, when fewer than 16 input bytes
+// remain.
+static inline void
+enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// For a clearer explanation of the algorithm used by this function,
+	// please refer to the plain (not inline assembly) implementation. This
+	// function follows the same basic logic.
+
+	if (*slen < 16) {
+		return;
+	}
+
+	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
+	// bytes, so "reserve" four bytes from the input buffer to ensure that
+	// we never read beyond the end of the input buffer.
+	size_t rounds = (*slen - 4) / 12;
+
+	// The lengths are settled up front; the asm below only moves the
+	// pointers.
+	*slen -= rounds * 12;   // 12 bytes consumed per round
+	*olen += rounds * 16;   // 16 bytes produced per round
+
+	// Number of times to go through the 36x loop.
+	size_t loops = rounds / 36;
+
+	// Number of rounds remaining after the 36x loop.
+	rounds %= 36;
+
+	// Lookup tables.
+	const __m128i lut0 = _mm_set_epi8(
+		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1);
+
+	const __m128i lut1 = _mm_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	// Temporary registers.
+	__m128i a, b, c, d, e, f;
+
+	// Jump targets below are numeric local labels: an "f" suffix refers
+	// to the next definition of that label, a "b" suffix to the previous.
+	__asm__ volatile (
+
+		// If there are 36 rounds or more, enter a 36x unrolled loop of
+		// interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations (table lookups,
+		// etc) to maximize pipeline throughput.
+		"    test %[loops], %[loops] \n\t"
+		"    jz   18f                \n\t"
+		"    jmp  36f                \n\t"
+		"                            \n\t"
+		".balign 64                  \n\t"
+		"36: " ROUND_3_INIT()
+		"    " ROUND_3_A( 0)
+		"    " ROUND_3_B( 3)
+		"    " ROUND_3_A( 6)
+		"    " ROUND_3_B( 9)
+		"    " ROUND_3_A(12)
+		"    " ROUND_3_B(15)
+		"    " ROUND_3_A(18)
+		"    " ROUND_3_B(21)
+		"    " ROUND_3_A(24)
+		"    " ROUND_3_B(27)
+		"    " ROUND_3_A_LAST(30)
+		// The unrolled body addresses each round by displacement, so
+		// the pointers are bumped once per pass.
+		"    add $(12 * 36), %[src] \n\t"
+		"    add $(16 * 36), %[dst] \n\t"
+		"    dec %[loops]           \n\t"
+		"    jnz 36b                \n\t"
+
+		// Enter an 18x unrolled loop for rounds of 18 or more.
+		"18: cmp $18, %[rounds] \n\t"
+		"    jl  9f             \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B(3)
+		"    " ROUND_3_A(6)
+		"    " ROUND_3_B(9)
+		"    " ROUND_3_A_LAST(12)
+		"    sub $18,        %[rounds] \n\t"
+		"    add $(12 * 18), %[src]    \n\t"
+		"    add $(16 * 18), %[dst]    \n\t"
+
+		// Enter a 9x unrolled loop for rounds of 9 or more.
+		"9:  cmp $9, %[rounds] \n\t"
+		"    jl  6f            \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B_LAST(3)
+		"    sub $9,        %[rounds] \n\t"
+		"    add $(12 * 9), %[src]    \n\t"
+		"    add $(16 * 9), %[dst]    \n\t"
+
+		// Enter a 6x unrolled loop for rounds of 6 or more.
+		"6:  cmp $6, %[rounds] \n\t"
+		"    jl  55f           \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A_LAST(0)
+		"    sub $6,        %[rounds] \n\t"
+		"    add $(12 * 6), %[src]    \n\t"
+		"    add $(16 * 6), %[dst]    \n\t"
+
+		// Dispatch the remaining rounds 0..5.
+		// Counts 0..3 are resolved here; counts above 3 go to label 45.
+		"55: cmp $3, %[rounds] \n\t"
+		"    jg  45f           \n\t"
+		"    je  3f            \n\t"
+		"    cmp $1, %[rounds] \n\t"
+		"    jg  2f            \n\t"
+		"    je  1f            \n\t"
+		"    jmp 0f            \n\t"
+
+		// A count of 4 jumps to label 4; anything else arriving here
+		// falls through into label 5.
+		"45: cmp $4, %[rounds] \n\t"
+		"    je  4f            \n\t"
+
+		// Block of non-interlaced encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		"5: " ROUND()
+		"4: " ROUND()
+		"3: " ROUND()
+		"2: " ROUND()
+		"1: " ROUND()
+		"0: \n\t"
+
+		// Outputs (modified).
+		// *s and *o are bound directly, so the pointer updates made
+		// by the asm are visible to the caller. The six temporaries
+		// are write-only, early-clobber ("=&") operands.
+		: [rounds] "+r"  (rounds),
+		  [loops]  "+r"  (loops),
+		  [src]    "+r"  (*s),
+		  [dst]    "+r"  (*o),
+		  [a]      "=&x" (a),
+		  [b]      "=&x" (b),
+		  [c]      "=&x" (c),
+		  [d]      "=&x" (d),
+		  [e]      "=&x" (e),
+		  [f]      "=&x" (f)
+
+		// Inputs (not modified).
+		: [lut0] "x" (lut0),
+		  [lut1] "x" (lut1),
+		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
+		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
+		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
+		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
+		  [n51]  "x" (_mm_set1_epi8(51)),
+		  [n25]  "x" (_mm_set1_epi8(25))
+
+		// Clobbers.
+		// "cc": test/cmp/add/sub/dec change the flags.
+		// "memory": the asm reads the input and writes the output
+		// buffer through pointers.
+		: "cc", "memory"
+	);
+}
+
+#pragma GCC diagnostic pop
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/codec.c
@@ -0,0 +1,58 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX2
+#include <immintrin.h>
+
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+// If BASE64_AVX2_USE_ASM is already defined when this point is reached, the
+// #ifndef leaves that value untouched, so the detection can be overridden.
+#ifndef BASE64_AVX2_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+#  define BASE64_AVX2_USE_ASM 1
+# else
+#  define BASE64_AVX2_USE_ASM 0
+# endif
+#endif
+
+// The decoder sources are textually included, so they are compiled as part
+// of this translation unit.
+#include "./dec_reshuffle.c"
+#include "./dec_loop.c"
+
+// Encoder: either the inline-assembly loop or the plain C sources. Both
+// enc_loop_asm.c and enc_loop.c define enc_loop_avx2 with the same
+// signature, so the call site needs no conditional.
+#if BASE64_AVX2_USE_ASM
+# include "./enc_loop_asm.c"
+#else
+# include "./enc_translate.c"
+# include "./enc_reshuffle.c"
+# include "./enc_loop.c"
+#endif
+
+#endif	// HAVE_AVX2
+
+// Streaming encoder entry point for the AVX2 codec. The parameter list comes
+// from the BASE64_ENC_PARAMS macro (defined elsewhere); the fallback branch
+// below shows that it provides state, src, srclen, out and outlen.
+void
+base64_stream_encode_avx2 BASE64_ENC_PARAMS
+{
+#if HAVE_AVX2
+	// NOTE(review): enc_head.c is textually included into the function
+	// body and presumably declares the locals s, slen, o and olen that
+	// are used below -- confirm against that file.
+	#include "../generic/enc_head.c"
+	enc_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/enc_tail.c"
+#else
+	// Built without AVX2 support: hand all arguments to the shared stub.
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+// Streaming decoder entry point for the AVX2 codec. The parameter list comes
+// from the BASE64_DEC_PARAMS macro (defined elsewhere); the fallback branch
+// below shows that it provides state, src, srclen, out and outlen.
+int
+base64_stream_decode_avx2 BASE64_DEC_PARAMS
+{
+#if HAVE_AVX2
+	// NOTE(review): no return statement is visible in this branch, so the
+	// return value presumably comes from the included dec_tail.c -- confirm.
+	#include "../generic/dec_head.c"
+	dec_loop_avx2(&s, &slen, &o, &olen);
+	#include "../generic/dec_tail.c"
+#else
+	// Built without AVX2 support: return whatever the shared stub returns.
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/dec_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/dec_loop.c
@@ -0,0 +1,110 @@
+// Decode one 32-byte group of input into 24 bytes of output.
+//
+// Returns 0 without storing anything or moving any pointer when the
+// validity test on the loaded block fails. Otherwise stores a full 32-byte
+// vector at *o, advances *s by 32 and *o by 24, decrements *rounds, and
+// returns 1.
+static BASE64_FORCE_INLINE int
+dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	// Lookup tables; each holds the same 16 entries in both 128-bit halves.
+	const __m256i lo_table = _mm256_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	const __m256i hi_table = _mm256_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	const __m256i roll_table = _mm256_setr_epi8(
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0,
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0);
+
+	const __m256i byte_2F = _mm256_set1_epi8(0x2F);
+
+	// Fetch 32 input bytes (unaligned load).
+	const __m256i block = _mm256_loadu_si256((__m256i *) *s);
+
+	// See the SSSE3 decoder for an explanation of the algorithm.
+	const __m256i lo_idx   = _mm256_and_si256(block, byte_2F);
+	const __m256i hi_idx   = _mm256_and_si256(_mm256_srli_epi32(block, 4), byte_2F);
+	const __m256i lo_class = _mm256_shuffle_epi8(lo_table, lo_idx);
+	const __m256i hi_class = _mm256_shuffle_epi8(hi_table, hi_idx);
+
+	// Any bit shared between the two lookups rejects the whole block.
+	if (_mm256_testz_si256(lo_class, hi_class) == 0) {
+		return 0;
+	}
+
+	// Select the per-byte delta and add it to the input.
+	const __m256i is_2F = _mm256_cmpeq_epi8(block, byte_2F);
+	const __m256i delta = _mm256_shuffle_epi8(roll_table, _mm256_add_epi8(is_2F, hi_idx));
+	const __m256i sextets = _mm256_add_epi8(block, delta);
+
+	// Pack into the 12-bytes-per-lane output layout and write it out.
+	_mm256_storeu_si256((__m256i *) *o, dec_reshuffle(sextets));
+
+	*rounds -= 1;
+	*o += 24;
+	*s += 32;
+
+	return 1;
+}
+
+// Bulk AVX2 decoder loop: decodes as many whole 32-byte groups as possible
+// and updates the caller's pointers and lengths. Does nothing when fewer
+// than 45 input bytes are available. Stops early at the first group that
+// dec_loop_avx2_inner rejects, leaving that group unconsumed.
+static inline void
+dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 45) {
+		return;
+	}
+
+	// Each round consumes 32 bytes. Every store writes 8 extra zero bytes
+	// past the 24 bytes of real output, so keep at least 13 input bytes
+	// in reserve to cover that gap (11 data bytes and up to two
+	// end-of-string markers).
+	size_t rounds = (*slen - 13) / 32;
+
+	// Account for all planned rounds now; skipped ones are refunded below.
+	*slen -= rounds * 32;	// 32 bytes consumed per round
+	*olen += rounds * 24;	// 24 bytes produced per round
+
+	do {
+		// Exactly one round left: run it and finish.
+		if (rounds < 2) {
+			dec_loop_avx2_inner(s, o, &rounds);
+			break;
+		}
+
+		// Two or three rounds left: run a pair.
+		if (rounds < 4) {
+			if (!dec_loop_avx2_inner(s, o, &rounds) ||
+			    !dec_loop_avx2_inner(s, o, &rounds)) {
+				break;
+			}
+			continue;
+		}
+
+		// Four to seven rounds left: run four.
+		if (rounds < 8) {
+			if (!dec_loop_avx2_inner(s, o, &rounds) ||
+			    !dec_loop_avx2_inner(s, o, &rounds) ||
+			    !dec_loop_avx2_inner(s, o, &rounds) ||
+			    !dec_loop_avx2_inner(s, o, &rounds)) {
+				break;
+			}
+			continue;
+		}
+
+		// Eight or more rounds left: run eight.
+		if (!dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds) ||
+		    !dec_loop_avx2_inner(s, o, &rounds)) {
+			break;
+		}
+
+	} while (rounds > 0);
+
+	// Refund the rounds that were planned but never completed:
+	*slen += rounds * 32;
+	*olen -= rounds * 24;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/dec_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/dec_reshuffle.c
@@ -0,0 +1,34 @@
+// Pack the 32 six-bit values in `in` (one per byte) into 24 contiguous
+// bytes at the low end of the returned vector.
+static BASE64_FORCE_INLINE __m256i
+dec_reshuffle (const __m256i in)
+{
+	// Multipliers for the two merge steps, and the two packing orders.
+	const __m256i mul_pairs = _mm256_set1_epi32(0x01400140);
+	const __m256i mul_quads = _mm256_set1_epi32(0x00011000);
+	const __m256i byte_order = _mm256_setr_epi8(
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
+		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
+	const __m256i lane_order = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1);
+
+	// Input, lower lane. Upper-case letters are the most significant
+	// bits of a value, lower-case letters the least significant bits:
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	// Step 1: merge adjacent bytes into 16-bit words.
+	const __m256i words = _mm256_maddubs_epi16(in, mul_pairs);
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	// Step 2: merge adjacent words into 32-bit dwords.
+	const __m256i dwords = _mm256_madd_epi16(words, mul_quads);
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Step 3: gather the three useful bytes of each dword, per lane.
+	const __m256i packed = _mm256_shuffle_epi8(dwords, byte_order);
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+
+	// Step 4: close the gap between the two lanes.
+	return _mm256_permutevar8x32_epi32(packed, lane_order);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_loop.c
@@ -0,0 +1,89 @@
+// Encode the first 24-byte group. Unlike later rounds, the load happens at
+// *s itself rather than 4 bytes before it, and a permute supplies the
+// 4-byte shift. Stores 32 output bytes, then advances *s by 20 and *o by 32.
+static BASE64_FORCE_INLINE void
+enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
+{
+	// Dword order that moves the data up by 4 bytes, as enc_reshuffle
+	// requires:
+	const __m256i shift_up = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
+
+	// Reading at offset 0 avoids touching memory ahead of the buffer:
+	const __m256i raw = _mm256_loadu_si256((__m256i *) *s);
+	const __m256i shifted = _mm256_permutevar8x32_epi32(raw, shift_up);
+
+	// Reshuffle, translate and store in one go:
+	_mm256_storeu_si256((__m256i *) *o, enc_translate(enc_reshuffle(shifted)));
+
+	// Later rounds load from 4 bytes before their data, so step the
+	// input by 20 rather than 24 here:
+	*o += 32;
+	*s += 20;
+}
+
+// Encode one 24-byte group: load 32 bytes from *s, reshuffle and translate
+// them, store 32 bytes at *o, then advance *s by 24 and *o by 32.
+static BASE64_FORCE_INLINE void
+enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
+{
+	// Unaligned 32-byte load of the input:
+	const __m256i raw = _mm256_loadu_si256((__m256i *) *s);
+
+	// Reshuffle first, then translate:
+	const __m256i encoded = enc_translate(enc_reshuffle(raw));
+
+	_mm256_storeu_si256((__m256i *) *o, encoded);
+
+	*o += 32;
+	*s += 24;
+}
+
+// Bulk AVX2 encoder loop (plain C version): encodes as many whole 24-byte
+// groups as possible and updates the caller's pointers and lengths. Does
+// nothing when fewer than 32 input bytes are available.
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 32) {
+		return;
+	}
+
+	// Each round consumes 24 bytes but loads 32, starting 4 bytes before
+	// its data. Keeping 4 bytes in reserve after the last round stops the
+	// final load from running past the end of the input buffer:
+	size_t rounds = (*slen - 4) / 24;
+
+	*slen -= rounds * 24;   // 24 bytes consumed per round
+	*olen += rounds * 32;   // 32 bytes produced per round
+
+	// The first round is special-cased so that its load, which would
+	// otherwise start before the buffer, begins at the buffer start:
+	enc_loop_avx2_inner_first(s, o);
+	rounds--;
+
+	while (rounds > 0) {
+		// A single round left: run it and finish.
+		if (rounds == 1) {
+			enc_loop_avx2_inner(s, o);
+			break;
+		}
+
+		if (rounds < 4) {
+			// Two or three left: run a pair.
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 2;
+		}
+		else if (rounds < 8) {
+			// Four to seven left: run four.
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 4;
+		}
+		else {
+			// Eight or more left: run eight.
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			enc_loop_avx2_inner(s, o);
+			rounds -= 8;
+		}
+	}
+
+	// Undo the -4 load offset so *s points at the first unconsumed byte:
+	*s += 4;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_loop_asm.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_loop_asm.c
@@ -0,0 +1,291 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code was
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads register R0 from memory. The
+// offset at which the register is loaded is set by the given round and a
+// constant offset.
+// R0 must be a string literal naming an asm operand (e.g. "a"). ROUND and
+// OFFSET are stringized into the instruction text, so the displacement
+// ROUND * 24 + OFFSET is computed by the assembler; 24 is the number of
+// input bytes per round.
+#define LOAD(R0, ROUND, OFFSET) \
+	"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
+
+// Generate a block of inline assembly that deinterleaves and shuffles register
+// R0 using preloaded constants. R0 is only read; the result is left in R1,
+// and R2 is overwritten as a temporary. (Operands are in AT&T order, so the
+// last operand of each instruction is its destination.)
+#define SHUF(R0, R1, R2) \
+	"vpshufb  %[lut0], %["R0"], %["R1"] \n\t" \
+	"vpand    %["R1"], %[msk0], %["R2"] \n\t" \
+	"vpand    %["R1"], %[msk2], %["R1"] \n\t" \
+	"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
+	"vpmullw  %["R1"], %[msk3], %["R1"] \n\t" \
+	"vpor     %["R1"], %["R2"], %["R1"] \n\t"
+
+// Generate a block of inline assembly that takes R0 and R1 and translates
+// their contents to the base64 alphabet, using preloaded constants.
+// As written, R1 is the input (it is only read), the translated result is
+// left in R0, and R2 is overwritten as a temporary. Called with the same
+// register arguments as SHUF, this puts the final result back in SHUF's R0.
+#define TRAN(R0, R1, R2) \
+	"vpsubusb %[n51],  %["R1"], %["R0"] \n\t" \
+	"vpcmpgtb %[n25],  %["R1"], %["R2"] \n\t" \
+	"vpsubb   %["R2"], %["R0"], %["R0"] \n\t" \
+	"vpshufb  %["R0"], %[lut1], %["R2"] \n\t" \
+	"vpaddb   %["R1"], %["R2"], %["R0"] \n\t"
+
+// Generate a block of inline assembly that stores the given register R0 at an
+// offset set by the given round.
+// ROUND is stringized into the instruction text, so the displacement
+// ROUND * 32 is computed by the assembler; 32 is the number of output bytes
+// per round.
+#define STOR(R0, ROUND) \
+	"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
+
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round: fetch the data, process it, and store the result. Then update
+// the source and destination pointers.
+// The load is issued 4 bytes before the source pointer (OFFSET is -4).
+// Uses operand a for the data and operands b and c as scratch. The source
+// pointer advances by 24 bytes and the destination pointer by 32 bytes.
+#define ROUND() \
+	LOAD("a", 0, -4) \
+	SHUF("a", "b", "c") \
+	TRAN("a", "b", "c") \
+	STOR("a", 0) \
+	"add $24, %[src] \n\t" \
+	"add $32, %[dst] \n\t"
+
+// Define a macro that initiates a three-way interleaved encoding round by
+// preloading registers a, b and c from memory.
+// All three loads use an offset of -4. On exit, a holds round 0 already
+// shuffled and translated (ready to be stored), while b and c hold the raw
+// input of rounds 1 and 2. Registers d and e are overwritten as scratch.
+// Nothing is stored and the pointers are not advanced.
+// The register graph shows which registers are in use during each step, and
+// is a visual aid for choosing registers for that step. Symbol index:
+//
+//  +  indicates that a register is loaded by that step.
+//  |  indicates that a register is in use and must not be touched.
+//  -  indicates that a register is decommissioned by that step.
+//  x  indicates that a register is used as a temporary by that step.
+//  V  indicates that a register is an input or output to the macro.
+//
+#define ROUND_3_INIT() 			/*  a b c d e f  */ \
+	LOAD("a",   0,  -4)		/*  +            */ \
+	SHUF("a", "d", "e")		/*  |     + x    */ \
+	LOAD("b",   1,  -4)		/*  | +   |      */ \
+	TRAN("a", "d", "e")		/*  | |   - x    */ \
+	LOAD("c",   2,  -4)		/*  V V V        */
+
+// Define a macro that translates, shuffles and stores the input registers A, B
+// and C, and preloads registers D, E and F for the next round.
+// This macro can be arbitrarily daisy-chained by feeding output registers D, E
+// and F back into the next round as input registers A, B and C. The macro
+// carefully interleaves memory operations with data operations for optimal
+// pipelined performance.
+//
+// Register contract, as implemented by the steps below: on entry A holds an
+// encoded block that is ready to store, and B and C hold raw input. On exit D
+// holds an encoded block and E and F hold raw input. Blocks ROUND+0 through
+// ROUND+2 are stored and blocks ROUND+3 through ROUND+5 are loaded (at an
+// offset of -4); the pointers themselves are not advanced.
+
+#define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
+	LOAD(D, (ROUND + 3), -4)	/*  V V V +      */ \
+	SHUF(B, E, F)			/*  | | | | + x  */ \
+	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
+	TRAN(B, E, F)			/*    | | | - x  */ \
+	LOAD(E, (ROUND + 4), -4)	/*    | | | +    */ \
+	SHUF(C, A, F)			/*  + | | | | x  */ \
+	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
+	TRAN(C, A, F)			/*  -   | | | x  */ \
+	LOAD(F, (ROUND + 5), -4)	/*      | | | +  */ \
+	SHUF(D, A, B)			/*  + x | | | |  */ \
+	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
+	TRAN(D, A, B)			/*  - x   V V V  */
+
+// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
+// registers D, E and F, and translating, shuffling and storing them.
+// No loads are performed. D is stored as-is at ROUND+3 (it arrives already
+// encoded); E and F are shuffled and translated before being stored at
+// ROUND+4 and ROUND+5. Registers A, B, C and D are reused as scratch.
+#define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
+	SHUF(E, A, B)			/*  + x   V V V  */ \
+	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
+	TRAN(E, A, B)			/*  - x     | |  */ \
+	SHUF(F, C, D)			/*      + x | |  */ \
+	STOR(E, (ROUND + 4))		/*      |   - |  */ \
+	TRAN(F, C, D)			/*      - x   |  */ \
+	STOR(F, (ROUND + 5))		/*            -  */
+
+// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
+#define ROUND_3_A(ROUND) \
+	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Define a type B round. Inputs and outputs are swapped with regard to type A.
+// Because each type's outputs are the other type's inputs, A and B rounds
+// are chained alternately.
+#define ROUND_3_B(ROUND) \
+	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Terminating macro for a type A round.
+// The same ROUND value is passed to both halves, so one invocation stores
+// six blocks: ROUND+0..ROUND+2 from ROUND_3 and ROUND+3..ROUND+5 from
+// ROUND_3_END.
+#define ROUND_3_A_LAST(ROUND) \
+	ROUND_3_A(ROUND) \
+	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Terminating macro for a type B round. Stores six blocks, like
+// ROUND_3_A_LAST, with the register roles swapped.
+#define ROUND_3_B_LAST(ROUND) \
+	ROUND_3_B(ROUND) \
+	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
+static inline void
+enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Bulk AVX2 encoder written as inline assembly: every round turns 24
+	// input bytes into 32 output bytes, advancing *s and *o and updating
+	// *slen and *olen. See the plain (not inline assembly) version for the algorithm.
+
+	if (*slen < 32) {
+		return;
+	}
+
+	// Process blocks of 24 bytes at a time. Because blocks are loaded 32
+	// bytes at a time at an offset of -4, ensure that there will be at
+	// least 4 remaining bytes after the last round, so that the final read
+	// will not pass beyond the bounds of the input buffer.
+	size_t rounds = (*slen - 4) / 24;
+
+	*slen -= rounds * 24;   // 24 bytes consumed per round
+	*olen += rounds * 32;   // 32 bytes produced per round
+
+	// Pre-decrement the number of rounds to get the number of rounds
+	// *after* the first round, which is handled as a special case.
+	rounds--;
+
+	// Number of times to go through the 36x loop.
+	size_t loops = rounds / 36;
+
+	// Number of rounds remaining after the 36x loop.
+	rounds %= 36;
+
+	// Lookup tables: lut0 has the same values as the shuffle indices in enc_reshuffle.c, lut1 the same values as the offset table in enc_translate.c.
+	const __m256i lut0 = _mm256_set_epi8(
+		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1,
+		14, 15, 13, 14, 11, 12, 10, 11,  8,  9,  7,  8,  5,  6,  4,  5);
+
+	const __m256i lut1 = _mm256_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	// Temporary registers.
+	__m256i a, b, c, d, e;
+
+	// Temporary register f doubles as the shift mask for the first round.
+	__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
+
+	__asm__ volatile (
+
+		// The first loop iteration requires special handling to ensure
+		// that the read, which is normally done at an offset of -4,
+		// does not underflow the buffer. Load the buffer at an offset
+		// of 0 and permute the input to achieve the same effect.
+		LOAD("a", 0, 0)
+		"vpermd %[a], %[f], %[a] \n\t"
+
+		// Perform the standard shuffling and translation steps.
+		SHUF("a", "b", "c")
+		TRAN("a", "b", "c")
+
+		// Store the result and increment the source and dest pointers.
+		"vmovdqu %[a], (%[dst]) \n\t"
+		"add     $24,  %[src]   \n\t"
+		"add     $32,  %[dst]   \n\t"
+
+		// If there are 36 rounds or more, enter a 36x unrolled loop of
+		// interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations (table lookups,
+		// etc) to maximize pipeline throughput.
+		"    test %[loops], %[loops] \n\t"
+		"    jz   18f                \n\t"
+		"    jmp  36f                \n\t"
+		"                            \n\t"
+		".balign 64                  \n\t"
+		"36: " ROUND_3_INIT()
+		"    " ROUND_3_A( 0)
+		"    " ROUND_3_B( 3)
+		"    " ROUND_3_A( 6)
+		"    " ROUND_3_B( 9)
+		"    " ROUND_3_A(12)
+		"    " ROUND_3_B(15)
+		"    " ROUND_3_A(18)
+		"    " ROUND_3_B(21)
+		"    " ROUND_3_A(24)
+		"    " ROUND_3_B(27)
+		"    " ROUND_3_A_LAST(30)
+		"    add $(24 * 36), %[src] \n\t"
+		"    add $(32 * 36), %[dst] \n\t"
+		"    dec %[loops]           \n\t"
+		"    jnz 36b                \n\t"
+
+		// Enter an 18x unrolled block (run at most once) if 18 or more rounds remain.
+		"18: cmp $18, %[rounds] \n\t"
+		"    jl  9f             \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B(3)
+		"    " ROUND_3_A(6)
+		"    " ROUND_3_B(9)
+		"    " ROUND_3_A_LAST(12)
+		"    sub $18,        %[rounds] \n\t"
+		"    add $(24 * 18), %[src]    \n\t"
+		"    add $(32 * 18), %[dst]    \n\t"
+
+		// Enter a 9x unrolled block (run at most once) if 9 or more rounds remain.
+		"9:  cmp $9, %[rounds] \n\t"
+		"    jl  6f            \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B_LAST(3)
+		"    sub $9,        %[rounds] \n\t"
+		"    add $(24 * 9), %[src]    \n\t"
+		"    add $(32 * 9), %[dst]    \n\t"
+
+		// Enter a 6x unrolled block (run at most once) if 6 or more rounds remain.
+		"6:  cmp $6, %[rounds] \n\t"
+		"    jl  55f           \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A_LAST(0)
+		"    sub $6,        %[rounds] \n\t"
+		"    add $(24 * 6), %[src]    \n\t"
+		"    add $(32 * 6), %[dst]    \n\t"
+
+		// Dispatch the remaining rounds 0..5: jump to the label equal to the remaining count (5 is reached by falling through from label 45).
+		"55: cmp $3, %[rounds] \n\t"
+		"    jg  45f           \n\t"
+		"    je  3f            \n\t"
+		"    cmp $1, %[rounds] \n\t"
+		"    jg  2f            \n\t"
+		"    je  1f            \n\t"
+		"    jmp 0f            \n\t"
+
+		"45: cmp $4, %[rounds] \n\t"
+		"    je  4f            \n\t"
+
+		// Block of non-interlaced encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		"5: " ROUND()
+		"4: " ROUND()
+		"3: " ROUND()
+		"2: " ROUND()
+		"1: " ROUND()
+		"0: \n\t"
+
+		// Outputs (modified). a..e are early-clobber scratch outputs; f is read-write because it carries the first-round permute mask in.
+		: [rounds] "+r"  (rounds),
+		  [loops]  "+r"  (loops),
+		  [src]    "+r"  (*s),
+		  [dst]    "+r"  (*o),
+		  [a]      "=&x" (a),
+		  [b]      "=&x" (b),
+		  [c]      "=&x" (c),
+		  [d]      "=&x" (d),
+		  [e]      "=&x" (e),
+		  [f]      "+x"  (f)
+
+		// Inputs (not modified). The mask and constant values match those used by enc_reshuffle.c and enc_translate.c.
+		: [lut0] "x" (lut0),
+		  [lut1] "x" (lut1),
+		  [msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
+		  [msk1] "x" (_mm256_set1_epi32(0x04000040)),
+		  [msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
+		  [msk3] "x" (_mm256_set1_epi32(0x01000010)),
+		  [n51]  "x" (_mm256_set1_epi8(51)),
+		  [n25]  "x" (_mm256_set1_epi8(25))
+
+		// Clobbers: condition flags, plus memory because the asm reads and writes through src/dst.
+		: "cc", "memory"
+	);
+}
+
+#pragma GCC diagnostic pop
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_reshuffle.c
@@ -0,0 +1,83 @@
+static BASE64_FORCE_INLINE __m256i
+enc_reshuffle (const __m256i input)
+{
+	// AVX2 port of the SSSE3 reshuffle step. The input is expected to be
+	// shifted by 4 bytes so that both 128-bit lanes can be handled with
+	// in-lane shuffles. Input, bytes MSB to LSB:
+	//   0 0 0 0 x w v u t s r q p o n m
+	//   l k j i h g f e d c b a 0 0 0 0
+	//
+	// The result holds one 6-bit value per byte. The layouts below are
+	// shown for the lowest 32-bit word only (input bytes a, b, c); the
+	// other seven words follow the same pattern with d..f, g..i, and so
+	// on. Upper-case letters are the most significant bits of a byte,
+	// lower-case letters the least significant bits.
+
+	// Byte selectors that spread every 3-byte group over a 32-bit word.
+	const __m256i spread_idx = _mm256_set_epi8(
+		10, 11,  9, 10,
+		 7,  8,  6,  7,
+		 4,  5,  3,  4,
+		 1,  2,  0,  1,
+
+		14, 15, 13, 14,
+		11, 12, 10, 11,
+		 8,  9,  7,  8,
+		 5,  6,  4,  5);
+
+	// Bit masks and 16-bit multipliers for the two halves of the split.
+	const __m256i mask_a = _mm256_set1_epi32(0x0FC0FC00);
+	const __m256i mult_a = _mm256_set1_epi32(0x04000040);
+	const __m256i mask_b = _mm256_set1_epi32(0x003F03F0);
+	const __m256i mult_b = _mm256_set1_epi32(0x01000010);
+
+	// Word, bytes MSB to LSB: b c a b
+	const __m256i spread = _mm256_shuffle_epi8(input, spread_idx);
+
+	// Masked:     0000bbbb CC000000 AAAAAA00 00000000
+	// High-mul:   00000000 00bbbbCC 00000000 00AAAAAA
+	const __m256i part_a = _mm256_mulhi_epu16(
+		_mm256_and_si256(spread, mask_a), mult_a);
+
+	// Masked:     00000000 00cccccc 000000aa BBBB0000
+	// Low-mul:    00cccccc 00000000 00aaBBBB 00000000
+	const __m256i part_b = _mm256_mullo_epi16(
+		_mm256_and_si256(spread, mask_b), mult_b);
+
+	// Combined:   00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+	return _mm256_or_si256(part_a, part_b);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_translate.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx2/enc_translate.c
@@ -0,0 +1,30 @@
+static BASE64_FORCE_INLINE __m256i
+enc_translate (const __m256i in)
+{
+	// Turn every byte of `in` (a value 0..63) into its Base64 character
+	// by adding an offset that depends on the range the value falls in:
+	// #  From      To         Abs    Index  Characters
+	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
+	// 3  [62]      [43]       -19       12  +
+	// 4  [63]      [47]       -16       13  /
+
+	// Offset table, indexed by the "Index" column above (per 128-bit lane):
+	const __m256i offsets = _mm256_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	const __m256i n51 = _mm256_set1_epi8(51);
+	const __m256i n25 = _mm256_set1_epi8(25);
+
+	// Saturating subtract: already the right index for range #0, but one
+	// too low for every other range:
+	const __m256i base = _mm256_subs_epu8(in, n51);
+
+	// 0xFF (-1) wherever the value lies in range #[1..4], 0x00 for range #0:
+	const __m256i above_25 = _mm256_cmpgt_epi8(in, n25);
+
+	// Subtracting -1 bumps the index by one exactly where it was too low:
+	const __m256i slot = _mm256_sub_epi8(base, above_25);
+
+	// Fetch the offset for each byte and apply it:
+	const __m256i delta = _mm256_shuffle_epi8(offsets, slot);
+
+	return _mm256_add_epi8(in, delta);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/codec.c
@@ -0,0 +1,44 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_AVX512
+#include <immintrin.h>
+
+#include "../avx2/dec_reshuffle.c"
+#include "../avx2/dec_loop.c"
+#include "enc_reshuffle_translate.c"
+#include "enc_loop.c"
+
+#endif	// HAVE_AVX512
+
+void	// Streaming encode entry point for the AVX512 codec.
+base64_stream_encode_avx512 BASE64_ENC_PARAMS	// parameter list is supplied by the BASE64_ENC_PARAMS macro
+{
+#if HAVE_AVX512
+	#include "../generic/enc_head.c"	// textual include: local setup and start of the byte-wise encoder
+	enc_loop_avx512(&s, &slen, &o, &olen);	// bulk-encode as many 48-byte blocks as the loop accepts
+	#include "../generic/enc_tail.c"	// textual include: byte-wise encoding of what is left, then state write-back
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);	// HAVE_AVX512 is off: forward to the stub
+#endif
+}
+
+// Streaming decode entry point for the AVX512 codec. There is no AVX512-specific decode loop at present; the AVX2 loop (dec_loop_avx2) is reused.
+int
+base64_stream_decode_avx512 BASE64_DEC_PARAMS	// parameter list is supplied by the BASE64_DEC_PARAMS macro
+{
+#if HAVE_AVX512
+	#include "../generic/dec_head.c"	// textual include: local setup and start of the byte-wise decoder
+	dec_loop_avx2(&s, &slen, &o, &olen);	// bulk decode with the AVX2 loop
+	#include "../generic/dec_tail.c"	// textual include: byte-wise decoding of the rest; also contains the return statement
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);	// HAVE_AVX512 is off: forward to the stub
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/enc_loop.c
@@ -0,0 +1,61 @@
+static BASE64_FORCE_INLINE void
+enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
+{
+	// One encoding round: fetch a full 64-byte vector from the input,
+	// run it through the combined reshuffle/translate step and write the
+	// 64 resulting bytes to the output.
+	const __m512i raw = _mm512_loadu_si512((__m512i *) *s);
+	const __m512i encoded = enc_reshuffle_translate(raw);
+
+	_mm512_storeu_si512((__m512i *) *o, encoded);
+
+	// Step the cursors: 48 input bytes per round, 64 output bytes.
+	*s += 48;
+	*o += 64;
+}
+
+static inline void
+enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Bulk AVX512 encoder: runs as many 48-byte rounds as the input
+	// allows and adjusts *slen / *olen for the bytes consumed / produced.
+	// Inputs shorter than one 64-byte load are left untouched.
+	if (*slen < 64) {
+		return;
+	}
+
+	// Each round consumes 48 bytes but loads 64. Keep 24 bytes in reserve
+	// after the last round so that the final load stays within the bounds
+	// of the input buffer.
+	size_t todo = (*slen - 24) / 48;
+
+	*slen -= todo * 48;   // 48 bytes consumed per round
+	*olen += todo * 64;   // 64 bytes produced per round
+
+	// Eight rounds at a time for as long as possible...
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+	}
+
+	// ...then at most one group of four, one pair and one single round.
+	if (todo >= 4) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_avx512_inner(s, o);
+		enc_loop_avx512_inner(s, o);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_avx512_inner(s, o);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/enc_reshuffle_translate.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/avx512/enc_reshuffle_translate.c
@@ -0,0 +1,50 @@
+// AVX512 algorithm is based on permutevar and multishift. The code is based on
+// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
+
+static BASE64_FORCE_INLINE __m512i
+enc_reshuffle_translate (const __m512i input)
+{
+	// Reshuffle input bytes 0..47 into 64 6-bit values and translate them to Base64 ASCII in one go. Layout of one 32-bit input group (a..d are the four 6-bit values):
+	// [ 0  0  0  0  0  0  0  0|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
+	// output order  [1, 2, 0, 1]
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
+
+	const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
+	                                                0x04050304,
+	                                                0x07080607,
+	                                                0x0a0b090a,
+	                                                0x0d0e0c0d,
+	                                                0x10110f10,
+	                                                0x13141213,
+	                                                0x16171516,
+	                                                0x191a1819,
+	                                                0x1c1d1b1c,
+	                                                0x1f201e1f,
+	                                                0x22232122,
+	                                                0x25262425,
+	                                                0x28292728,
+	                                                0x2b2c2a2b,
+	                                                0x2e2f2d2e);
+
+	// Reorder bytes: each 32-bit output word takes input bytes [1, 2, 0, 1] of its 3-byte group
+	// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
+	//  a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
+	const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
+
+	// After multishift a single 32-bit lane has following layout
+	// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
+	//  a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
+	// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
+
+	// Bytes of the shift constant, MSB to LSB: 48, 54, 36, 42, 16, 22, 4, 10
+	const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
+	__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
+
+	// Translate immediately after reshuffling: load 64 bytes of the 6-bit encoding table.
+	const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
+
+	// Translate 6-bit values to ASCII, using each reshuffled byte as an index into the table.
+	return _mm512_permutexvar_epi8(shuffled_in, lookup);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/32/dec_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/32/dec_loop.c
@@ -0,0 +1,86 @@
+static BASE64_FORCE_INLINE int
+dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{	// Decode one group of 4 input characters. Returns 1 on success, 0 (with nothing stored or advanced) on an invalid character.
+	const uint32_t str	// OR together one table entry per input character
+		= base64_table_dec_32bit_d0[(*s)[0]]
+		| base64_table_dec_32bit_d1[(*s)[1]]
+		| base64_table_dec_32bit_d2[(*s)[2]]
+		| base64_table_dec_32bit_d3[(*s)[3]];
+
+#if BASE64_LITTLE_ENDIAN
+
+	// LUTs for little-endian set MSB in case of invalid character:
+	if (str & UINT32_C(0x80000000)) {
+		return 0;
+	}
+#else
+	// LUTs for big-endian set LSB in case of invalid character:
+	if (str & UINT32_C(1)) {
+		return 0;
+	}
+#endif
+	// Store the output. All sizeof (str) = 4 bytes are written, although the output pointer only advances by 3 below:
+	memcpy(*o, &str, sizeof (str));
+
+	*s += 4;
+	*o += 3;
+	*rounds -= 1;	// only decremented on success, so the caller can see how many rounds were not completed
+
+	return 1;
+}
+
+static inline void
+dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{	// Bulk decoder: runs 4-character rounds until the planned rounds are done or an invalid character is hit.
+	if (*slen < 8) {
+		return;
+	}
+
+	// Process blocks of 4 bytes per round. Because one extra zero byte is
+	// written after the output, ensure that there will be at least 4 bytes
+	// of input data left to cover the gap. (Two data bytes and up to two
+	// end-of-string markers.)
+	size_t rounds = (*slen - 4) / 4;
+
+	*slen -= rounds * 4;	// 4 bytes consumed per round
+	*olen += rounds * 3;	// 3 bytes produced per round
+
+	do {	// each && chain stops at the first failing round; `rounds` is decremented inside the inner function
+		if (rounds >= 8) {
+			if (dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// a round failed: leave the loop with `rounds` counting the rounds not completed
+		}
+		if (rounds >= 4) {
+			if (dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// a round failed
+		}
+		if (rounds >= 2) {
+			if (dec_loop_generic_32_inner(s, o, &rounds) &&
+			    dec_loop_generic_32_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;	// a round failed
+		}
+		dec_loop_generic_32_inner(s, o, &rounds);	// final single round; the result is not needed because `rounds` already reflects success or failure
+		break;
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 4;
+	*olen -= rounds * 3;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/32/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/32/enc_loop.c
@@ -0,0 +1,73 @@
+static BASE64_FORCE_INLINE void
+enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
+{
+	// One encoding round: 3 input bytes become 4 output characters by way
+	// of two lookups in the 12-bit table. A whole 32-bit word is read, so
+	// one byte beyond the 3 that are consumed is touched as well.
+	uint32_t word;
+
+	memcpy(&word, *s, sizeof (word));
+
+	// The bit fields below straddle byte boundaries, which only works if
+	// the word is in big-endian order; convert when the host is not:
+	word = BASE64_HTOBE32(word);
+
+	// Upper and lower 12-bit fields of the 24 payload bits:
+	const size_t upper12 = (word >> 20) & 0xFFFU;
+	const size_t lower12 = (word >>  8) & 0xFFFU;
+
+	// Every table entry supplies two output characters:
+	memcpy(*o + 0, base64_table_enc_12bit + upper12, 2);
+	memcpy(*o + 2, base64_table_enc_12bit + lower12, 2);
+
+	*o += 4;
+	*s += 3;
+}
+
+static inline void
+enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Bulk encoder built on 32-bit loads: runs as many 3-byte rounds as
+	// the input allows and adjusts *slen / *olen for the bytes consumed /
+	// produced. Inputs shorter than one 4-byte load are left untouched.
+	if (*slen < 4) {
+		return;
+	}
+
+	// Each round consumes 3 bytes but loads 4. Keep one byte in reserve
+	// after the last round so that the final load stays within the bounds
+	// of the input buffer:
+	size_t todo = (*slen - 1) / 3;
+
+	*slen -= todo * 3;	// 3 bytes consumed per round
+	*olen += todo * 4;	// 4 bytes produced per round
+
+	// Eight rounds at a time for as long as possible...
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+	}
+
+	// ...then at most one group of four, one pair and one single round:
+	if (todo >= 4) {
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_generic_32_inner(s, o);
+		enc_loop_generic_32_inner(s, o);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_generic_32_inner(s, o);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/64/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/64/enc_loop.c
@@ -0,0 +1,77 @@
+static BASE64_FORCE_INLINE void
+enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
+{
+	// One encoding round: 6 input bytes become 8 output characters by way
+	// of four lookups in the 12-bit table. A whole 64-bit word is read, so
+	// two bytes beyond the 6 that are consumed are touched as well.
+	uint64_t word;
+
+	memcpy(&word, *s, sizeof (word));
+
+	// The bit fields below straddle byte boundaries, which only works if
+	// the word is in big-endian order; convert when the host is not:
+	word = BASE64_HTOBE64(word);
+
+	// The four 12-bit fields of the 48 payload bits, highest first:
+	const size_t field0 = (word >> 52) & 0xFFFU;
+	const size_t field1 = (word >> 40) & 0xFFFU;
+	const size_t field2 = (word >> 28) & 0xFFFU;
+	const size_t field3 = (word >> 16) & 0xFFFU;
+
+	// Every table entry supplies two output characters:
+	memcpy(*o + 0, base64_table_enc_12bit + field0, 2);
+	memcpy(*o + 2, base64_table_enc_12bit + field1, 2);
+	memcpy(*o + 4, base64_table_enc_12bit + field2, 2);
+	memcpy(*o + 6, base64_table_enc_12bit + field3, 2);
+
+	*o += 8;
+	*s += 6;
+}
+
+static inline void
+enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Bulk encoder built on 64-bit loads: runs as many 6-byte rounds as
+	// the input allows and adjusts *slen / *olen for the bytes consumed /
+	// produced. Inputs shorter than one 8-byte load are left untouched.
+	if (*slen < 8) {
+		return;
+	}
+
+	// Each round consumes 6 bytes but loads 8. Keep 2 bytes in reserve
+	// after the last round so that the final load stays within the bounds
+	// of the input buffer:
+	size_t todo = (*slen - 2) / 6;
+
+	*slen -= todo * 6;	// 6 bytes consumed per round
+	*olen += todo * 8;	// 8 bytes produced per round
+
+	// Eight rounds at a time for as long as possible...
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+	}
+
+	// ...then at most one group of four, one pair and one single round:
+	if (todo >= 4) {
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_generic_64_inner(s, o);
+		enc_loop_generic_64_inner(s, o);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_generic_64_inner(s, o);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/codec.c
@@ -0,0 +1,41 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if BASE64_WORDSIZE == 32
+#  include "32/enc_loop.c"
+#elif BASE64_WORDSIZE == 64
+#  include "64/enc_loop.c"
+#endif
+
+#if BASE64_WORDSIZE >= 32
+#  include "32/dec_loop.c"
+#endif
+
+void	// Streaming encode entry point for the plain (non-SIMD) codec.
+base64_stream_encode_plain BASE64_ENC_PARAMS	// parameter list is supplied by the BASE64_ENC_PARAMS macro
+{
+	#include "enc_head.c"	// textual include: local setup and start of the byte-wise encoder
+#if BASE64_WORDSIZE == 32
+	enc_loop_generic_32(&s, &slen, &o, &olen);	// bulk loop using 32-bit loads
+#elif BASE64_WORDSIZE == 64
+	enc_loop_generic_64(&s, &slen, &o, &olen);	// bulk loop using 64-bit loads
+#endif
+	#include "enc_tail.c"	// textual include: byte-wise encoding of what is left, then state write-back
+}
+
+int	// Streaming decode entry point for the plain (non-SIMD) codec.
+base64_stream_decode_plain BASE64_DEC_PARAMS	// parameter list is supplied by the BASE64_DEC_PARAMS macro
+{
+	#include "dec_head.c"	// textual include: local setup and start of the byte-wise decoder
+#if BASE64_WORDSIZE >= 32
+	dec_loop_generic_32(&s, &slen, &o, &olen);	// bulk loop; skipped entirely for word sizes below 32
+#endif
+	#include "dec_tail.c"	// textual include: byte-wise decoding of the rest; also contains the return statement
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/dec_head.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/dec_head.c
@@ -0,0 +1,37 @@
+int ret = 0;	// This fragment is #include'd at the top of each decoder function body; the switch/for it opens are closed in dec_tail.c.
+const uint8_t *s = (const uint8_t *) src;
+uint8_t *o = (uint8_t *) out;
+uint8_t q;	// decoded value of the current input character
+
+// Use local temporaries to avoid cache thrashing:
+size_t olen = 0;
+size_t slen = srclen;
+struct base64_state st;
+st.eof = state->eof;
+st.bytes = state->bytes;
+st.carry = state->carry;
+
+// If we previously saw an EOF or an invalid character, bail out:
+if (st.eof) {
+	*outlen = 0;
+	ret = 0;
+	// If there was a trailing '=' to check, check it. Success requires that the one remaining input byte decodes to 254 (the '=' marker) and nothing follows:
+	if (slen && (st.eof == BASE64_AEOF)) {
+		state->bytes = 0;
+		state->eof = BASE64_EOF;
+		ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
+	}
+	return ret;
+}
+
+// Turn four 6-bit numbers into three bytes:
+// out[0] = 11111122
+// out[1] = 22223333
+// out[2] = 33444444
+
+// Duff's device again: the switch jumps into the for(;;) loop at the case selected by st.bytes, the position carried over from the saved state.
+switch (st.bytes)
+{
+	for (;;)
+	{
+	case 0:
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/dec_tail.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/dec_tail.c
@@ -0,0 +1,91 @@
+		if (slen-- == 0) {	// This fragment continues `case 0:` of the switch/for opened in dec_head.c. Input exhausted: report success.
+			ret = 1;
+			break;
+		}
+		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+			st.eof = BASE64_EOF;
+			// Treat character '=' as invalid for byte 0:
+			break;
+		}
+		st.carry = q << 2;
+		st.bytes++;
+
+		// Deliberate fallthrough:
+		BASE64_FALLTHROUGH
+
+	case 1:	if (slen-- == 0) {
+			ret = 1;
+			break;
+		}
+		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+			st.eof = BASE64_EOF;
+			// Treat character '=' as invalid for byte 1:
+			break;
+		}
+		*o++ = st.carry | (q >> 4);
+		st.carry = q << 4;
+		st.bytes++;
+		olen++;
+
+		// Deliberate fallthrough:
+		BASE64_FALLTHROUGH
+
+	case 2:	if (slen-- == 0) {
+			ret = 1;
+			break;
+		}
+		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+			st.bytes++;
+			// When q == 254, the input char is '='.
+			// Check if next byte is also '=':
+			if (q == 254) {
+				if (slen-- != 0) {
+					st.bytes = 0;
+					// EOF: succeed only if the next char is also '=' and it is the last input byte.
+					st.eof = BASE64_EOF;
+					q = base64_table_dec_8bit[*s++];
+					ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+					break;
+				}
+				else {
+					// Almost EOF: the second '=' may arrive in the next call; dec_head.c checks for it.
+					st.eof = BASE64_AEOF;
+					ret = 1;
+					break;
+				}
+			}
+			// If we get here, there was an error (q == 255); ret keeps its initial value of 0:
+			break;
+		}
+		*o++ = st.carry | (q >> 2);
+		st.carry = q << 6;
+		st.bytes++;
+		olen++;
+
+		// Deliberate fallthrough:
+		BASE64_FALLTHROUGH
+
+	case 3:	if (slen-- == 0) {
+			ret = 1;
+			break;
+		}
+		if ((q = base64_table_dec_8bit[*s++]) >= 254) {
+			st.bytes = 0;
+			st.eof = BASE64_EOF;
+			// When q == 254, the input char is '='. Return 1 and EOF.
+			// When q == 255, the input char is invalid. Return 0 and EOF.
+			ret = ((q == 254) && (slen == 0)) ? 1 : 0;
+			break;
+		}
+		*o++ = st.carry | q;
+		st.carry = 0;
+		st.bytes = 0;
+		olen++;
+	}
+}
+
+state->eof = st.eof;	// write the local working copy back to the caller's state
+state->bytes = st.bytes;
+state->carry = st.carry;
+*outlen = olen;
+return ret;
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/enc_head.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/enc_head.c
@@ -0,0 +1,24 @@
+// This fragment is #include'd at the top of each encoder function body. Assume that *out is large enough to contain the output.
+// Theoretically it should be 4/3 the length of src.
+const uint8_t *s = (const uint8_t *) src;
+uint8_t *o = (uint8_t *) out;
+
+// Use local temporaries to avoid cache thrashing:
+size_t olen = 0;
+size_t slen = srclen;
+struct base64_state st;
+st.bytes = state->bytes;
+st.carry = state->carry;
+
+// Turn three bytes into four 6-bit numbers:
+// in[0] = 00111111
+// in[1] = 00112222
+// in[2] = 00222233
+// in[3] = 00333333
+
+// Duff's device, a for() loop inside a switch() statement. Legal! The switch enters the loop at the case selected by st.bytes; both are closed in enc_tail.c.
+switch (st.bytes)
+{
+	for (;;)
+	{
+	case 0:
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/enc_tail.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/generic/enc_tail.c
@@ -0,0 +1,34 @@
+		if (slen-- == 0) {	// This fragment continues `case 0:` of the switch/for opened in enc_head.c. Input exhausted: stop.
+			break;
+		}
+		*o++ = base64_table_enc_6bit[*s >> 2];	// top 6 bits of the first byte
+		st.carry = (*s++ << 4) & 0x30;	// keep its low 2 bits for the next character
+		st.bytes++;
+		olen += 1;
+
+		// Deliberate fallthrough:
+		BASE64_FALLTHROUGH
+
+	case 1:	if (slen-- == 0) {
+			break;
+		}
+		*o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];	// carried 2 bits plus top 4 bits of the second byte
+		st.carry = (*s++ << 2) & 0x3C;	// keep its low 4 bits for the next character
+		st.bytes++;
+		olen += 1;
+
+		// Deliberate fallthrough:
+		BASE64_FALLTHROUGH
+
+	case 2:	if (slen-- == 0) {
+			break;
+		}
+		*o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];	// carried 4 bits plus top 2 bits of the third byte
+		*o++ = base64_table_enc_6bit[*s++ & 0x3F];	// low 6 bits of the third byte
+		st.bytes = 0;
+		olen += 2;
+	}
+}
+state->bytes = st.bytes;	// write the position and carry back so a later call can resume mid-group
+state->carry = st.carry;
+*outlen = olen;
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/codec.c
@@ -0,0 +1,79 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#ifdef __arm__
+#  if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+#    define BASE64_USE_NEON32
+#  endif
+#endif
+
+#ifdef BASE64_USE_NEON32
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers.
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON32_USE_ASM
+#endif
+
+static BASE64_FORCE_INLINE uint8x16_t
+vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
+{
+	// Stand-in for the NEON64 `vqtbl1q_u8` intrinsic. NEON32 can only
+	// produce 64 bits per lookup into a 128-bit table, so the 128-bit
+	// wide lookup is assembled from two 64-bit wide ones.
+	uint8x8x2_t table;
+
+	// Present the 128-bit table as a pair of 64-bit halves:
+	table.val[0] = vget_low_u8(lut);
+	table.val[1] = vget_high_u8(lut);
+
+	// Look up each half of the index vector separately, then rejoin:
+	const uint8x8_t lower = vtbl2_u8(table, vget_low_u8(indices));
+	const uint8x8_t upper = vtbl2_u8(table, vget_high_u8(indices));
+
+	return vcombine_u8(lower, upper);
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/32/enc_loop.c"
+#include "dec_loop.c"
+#include "enc_reshuffle.c"
+#include "enc_translate.c"
+#include "enc_loop.c"
+
+#endif	// BASE64_USE_NEON32
+
+// Stride size is so large on these NEON 32-bit functions
+// (48 bytes encode, 32 bytes decode) that we inline the
+// uint32 codec to stay performant on smaller inputs.
+
+void
+base64_stream_encode_neon32 BASE64_ENC_PARAMS	// streaming encode entry point; parameter list is supplied by the BASE64_ENC_PARAMS macro
+{
+#ifdef BASE64_USE_NEON32
+	#include "../generic/enc_head.c"	// textual include: local setup and start of the byte-wise encoder
+	enc_loop_neon32(&s, &slen, &o, &olen);	// NEON bulk loop first
+	enc_loop_generic_32(&s, &slen, &o, &olen);	// then the 32-bit generic loop on what the NEON loop left over
+	#include "../generic/enc_tail.c"	// textual include: byte-wise encoding of the remainder, then state write-back
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);	// NEON32 not enabled: forward to the stub
+#endif
+}
+
+int	// Streaming decode entry point for the NEON32 codec.
+base64_stream_decode_neon32 BASE64_DEC_PARAMS	// parameter list is supplied by the BASE64_DEC_PARAMS macro
+{
+#ifdef BASE64_USE_NEON32
+	#include "../generic/dec_head.c"	// textual include: local setup and start of the byte-wise decoder
+	dec_loop_neon32(&s, &slen, &o, &olen);	// NEON bulk loop first
+	dec_loop_generic_32(&s, &slen, &o, &olen);	// then the 32-bit generic loop on what the NEON loop left over
+	#include "../generic/dec_tail.c"	// textual include: byte-wise decoding of the rest; also contains the return statement
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);	// NEON32 not enabled: forward to the stub
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/dec_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/dec_loop.c
@@ -0,0 +1,106 @@
+static BASE64_FORCE_INLINE int
+is_nonzero (const uint8x16_t v)
+{
+	// Report whether any bit of `v` is set. Each 64-bit half is narrowed
+	// to 32 bits with saturation, which keeps a nonzero half nonzero, so
+	// the whole vector can be tested with a single 64-bit comparison.
+	const uint32x2_t narrowed = vqmovn_u64(vreinterpretq_u64_u8(v));
+	uint64_t bits;
+
+	vst1_u64(&bits, vreinterpret_u64_u32(narrowed));
+
+	return bits != 0;
+}
+
+static BASE64_FORCE_INLINE uint8x16_t
+delta_lookup (const uint8x16_t v)
+{
+	// Replace every byte of `v` by the entry of an 8-entry delta table
+	// that it indexes. The table is 64 bits wide, so the two halves of
+	// `v` are looked up separately and then joined again.
+	const uint8x8_t deltas = {
+		0, 16, 19, 4, (uint8_t) -65, (uint8_t) -65, (uint8_t) -71, (uint8_t) -71,
+	};
+
+	const uint8x8_t lower = vtbl1_u8(deltas, vget_low_u8(v));
+	const uint8x8_t upper = vtbl1_u8(deltas, vget_high_u8(v));
+
+	return vcombine_u8(lower, upper);
+}
+
+static BASE64_FORCE_INLINE uint8x16_t
+dec_loop_neon32_lane (uint8x16_t *lane)
+{
+	// Translate 16 input characters in *lane in place by adding a per-character delta, and return a mask that the caller treats as invalid input when nonzero. See the SSSE3 decoder for an explanation of the algorithm.
+	const uint8x16_t lut_lo = {
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
+	};
+
+	const uint8x16_t lut_hi = {
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+	};
+
+	const uint8x16_t mask_0F = vdupq_n_u8(0x0F);
+	const uint8x16_t mask_2F = vdupq_n_u8(0x2F);
+
+	const uint8x16_t hi_nibbles = vshrq_n_u8(*lane, 4);
+	const uint8x16_t lo_nibbles = vandq_u8(*lane, mask_0F);
+	const uint8x16_t eq_2F      = vceqq_u8(*lane, mask_2F);	// 0xFF where the character is 0x2F ('/')
+
+	const uint8x16_t hi = vqtbl1q_u8(lut_hi, hi_nibbles);
+	const uint8x16_t lo = vqtbl1q_u8(lut_lo, lo_nibbles);
+
+	// Now simply add the delta values to the input. The delta index is the high nibble, minus one for '/' (adding 0xFF):
+	*lane = vaddq_u8(*lane, delta_lookup(vaddq_u8(eq_2F, hi_nibbles)));
+
+	// Return the validity mask: the AND of the two per-nibble table entries.
+	return vandq_u8(lo, hi);
+}
+
+static inline void
+dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{	// Bulk NEON decoder: decodes whole 64-byte blocks until the input runs out or a block fails validation.
+	if (*slen < 64) {
+		return;
+	}
+
+	// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
+	// extra trailing zero bytes are written, so it is not necessary to
+	// reserve extra input bytes:
+	size_t rounds = *slen / 64;
+
+	*slen -= rounds * 64;	// 64 bytes consumed per round
+	*olen += rounds * 48;	// 48 bytes produced per round
+
+	do {
+		uint8x16x3_t dec;
+
+		// Load 64 bytes and deinterleave:
+		uint8x16x4_t str = vld4q_u8(*s);
+
+		// Decode each lane, collect a mask of invalid inputs:
+		const uint8x16_t classified
+			= dec_loop_neon32_lane(&str.val[0])
+			| dec_loop_neon32_lane(&str.val[1])
+			| dec_loop_neon32_lane(&str.val[2])
+			| dec_loop_neon32_lane(&str.val[3]);
+
+		// Check for invalid input: if any byte of the combined
+		// classification mask is nonzero, stop here and leave this block
+		// to the code that runs after this loop (pointers are not advanced):
+		if (is_nonzero(classified)) {
+			break;
+		}
+
+		// Compress four bytes into three:
+		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+		// Interleave and store decoded result:
+		vst3q_u8(*o, dec);
+
+		*s += 64;
+		*o += 48;
+
+	} while (--rounds > 0);
+
+	// Adjust for any rounds that were skipped (rounds is nonzero only after an early break):
+	*slen += rounds * 64;
+	*olen -= rounds * 48;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_loop.c
@@ -0,0 +1,170 @@
+#ifdef BASE64_NEON32_USE_ASM
+static BASE64_FORCE_INLINE void
+enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
+{
+	// Encodes one 48-byte block at *s into 64 Base64 characters at *o and
+	// advances both pointers, duplicating enc_loop_neon32_inner entirely in
+	// inline assembly. This gives a significant speedup over using NEON
+	// intrinsics, which do not always generate very good code. The logic is
+	// lifted directly from the intrinsics version; use that as a guide.
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+	uint8x16_t mask0, mask1, mask2, mask3;
+
+	// Per-range offsets (modulo 256) to add to a 6-bit value, by LUT index.
+	const uint8x16_t lut = {
+		  65U,  71U, 252U, 252U,	// +65, +71, -4, -4
+		 252U, 252U, 252U, 252U,	// -4 (x4)
+		 252U, 252U, 252U, 252U,	// -4 (x4)
+		 237U, 240U,   0U,   0U	// -19, -16, then two entries not reached by 6-bit inputs
+	};
+
+	// Numeric constants.
+	const uint8x16_t n51 = vdupq_n_u8(51);
+	const uint8x16_t n25 = vdupq_n_u8(25);
+	const uint8x16_t n63 = vdupq_n_u8(63);
+
+	__asm__ (
+
+		// Load 48 bytes and deinterleave. The bytes are loaded to
+		// hard-coded registers q12, q13 and q14, to ensure that they
+		// are contiguous. Increment the source pointer.
+		"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
+		"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
+
+		// Reshuffle the bytes using temporaries.
+		"vshr.u8 %q[t0], q12,    #2      \n\t"
+		"vshr.u8 %q[t1], q13,    #4      \n\t"
+		"vshr.u8 %q[t2], q14,    #6      \n\t"
+		"vsli.8  %q[t1], q12,    #4      \n\t"
+		"vsli.8  %q[t2], q13,    #2      \n\t"
+		"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
+		"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
+		"vand.u8 %q[t3], q14,    %q[n63] \n\t"
+
+		// t0..t3 are the reshuffled inputs. Create LUT indices.
+		"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
+		"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
+		"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
+		"vqsub.u8 q15, %q[t3], %q[n51] \n\t"
+
+		// Create the masks: 0xFF where the value is > 25, else 0x00.
+		"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
+		"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
+		"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
+		"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
+
+		// Subtract the masks (0xFF == -1): adds 1 to indices of values > 25.
+		"vsub.u8 q12, %q[m0] \n\t"
+		"vsub.u8 q13, %q[m1] \n\t"
+		"vsub.u8 q14, %q[m2] \n\t"
+		"vsub.u8 q15, %q[m3] \n\t"
+
+		// Lookup the delta values.
+		"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
+		"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
+		"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
+		"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
+		"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
+		"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
+		"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
+		"vtbl.u8 d31, {%q[lut]}, d31 \n\t"
+
+		// Add the delta values.
+		"vadd.u8 q12, %q[t0] \n\t"
+		"vadd.u8 q13, %q[t1] \n\t"
+		"vadd.u8 q14, %q[t2] \n\t"
+		"vadd.u8 q15, %q[t3] \n\t"
+
+		// Store 64 bytes and interleave. Increment the dest pointer.
+		"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
+		"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
+
+		// Outputs (modified).
+		: [src] "+r"  (*s),
+		  [dst] "+r"  (*o),
+		  [t0]  "=&w" (tmp0),
+		  [t1]  "=&w" (tmp1),
+		  [t2]  "=&w" (tmp2),
+		  [t3]  "=&w" (tmp3),
+		  [m0]  "=&w" (mask0),
+		  [m1]  "=&w" (mask1),
+		  [m2]  "=&w" (mask2),
+		  [m3]  "=&w" (mask3)
+
+		// Inputs (not modified).
+		: [lut] "w" (lut),
+		  [n25] "w" (n25),
+		  [n51] "w" (n51),
+		  [n63] "w" (n63)
+
+		// Clobbers: d24..d31 are the hard-coded registers q12..q15.
+		: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
+		  "cc", "memory"
+	);
+}
+#endif
+
+// Encode a single round: the 48 input bytes at *s become 64 Base64 characters
+// at *o, and both cursors are moved past the data that was handled.
+static BASE64_FORCE_INLINE void
+enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
+{
+#ifndef BASE64_NEON32_USE_ASM
+	// Fetch 48 bytes, splitting every 3-byte group over three vectors:
+	const uint8x16x3_t block = vld3q_u8(*s);
+
+	// Redistribute the bits over four vectors, then map the resulting
+	// values onto the Base64 alphabet:
+	const uint8x16x4_t encoded = enc_translate(enc_reshuffle(block));
+
+	// Write 64 bytes, interleaving the four vectors again:
+	vst4q_u8(*o, encoded);
+
+	*o += 64;
+	*s += 48;
+#else
+	// The inline assembly variant does the same work, pointer updates
+	// included:
+	enc_loop_neon32_inner_asm(s, o);
+#endif
+}
+
+// Encode every complete 48-byte block of the input with the NEON32 round
+// function. s/slen describe the remaining input, o/olen the output written so
+// far; input that does not fill a whole block is left for the caller.
+static inline void
+enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t todo = *slen / 48;
+
+	// Each round turns 48 input bytes into 64 output bytes:
+	*olen += todo * 64;
+	*slen -= todo * 48;
+
+	// Eight rounds at a time for as long as possible:
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+	}
+
+	// At most seven rounds remain; peel them off as 4 + 2 + 1:
+	if (todo >= 4) {
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_neon32_inner(s, o);
+		enc_loop_neon32_inner(s, o);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_neon32_inner(s, o);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_reshuffle.c
@@ -0,0 +1,31 @@
+// Spread three vectors of input bytes over four vectors of 6-bit values.
+//
+// Per byte position, with the inputs written as bit strings:
+//   in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
+//   in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
+//   in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
+// the result is:
+//   out[0] = 00 00 a7 a6 a5 a4 a3 a2
+//   out[1] = 00 00 a1 a0 b7 b6 b5 b4
+//   out[2] = 00 00 b3 b2 b1 b0 c7 c6
+//   out[3] = 00 00 c5 c4 c3 c2 c1 c0
+static BASE64_FORCE_INLINE uint8x16x4_t
+enc_reshuffle (uint8x16x3_t in)
+{
+	const uint8x16_t low6 = vdupq_n_u8(0x3F);
+
+	// Shift the low part of each middle output into place and insert the
+	// bits of the preceding input byte above it. The top two bits still
+	// hold leftovers at this point:
+	const uint8x16_t ab = vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4);
+	const uint8x16_t bc = vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2);
+
+	uint8x16x4_t out;
+
+	// The plain right shift already leaves the top two bits clear; the
+	// other three outputs are masked down to six bits:
+	out.val[0] = vshrq_n_u8(in.val[0], 2);
+	out.val[1] = vandq_u8(ab, low6);
+	out.val[2] = vandq_u8(bc, low6);
+	out.val[3] = vandq_u8(in.val[2], low6);
+
+	return out;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_translate.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon32/enc_translate.c
@@ -0,0 +1,57 @@
+// Map four vectors of values 0..63 onto the Base64 alphabet.
+//
+// The alphabet consists of five ranges, each with its own offset:
+//   #  From      To         Abs    Index  Characters
+//   0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
+//   1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
+//   2  [52..61]  [48..57]    -4  [2..11]  0123456789
+//   3  [62]      [43]       -19       12  +
+//   4  [63]      [47]       -16       13  /
+static BASE64_FORCE_INLINE uint8x16x4_t
+enc_translate (const uint8x16x4_t in)
+{
+	// Offsets per index, as listed in the "Abs" column above:
+	const uint8x16_t lut = {
+		 65U,  71U, 252U, 252U,
+		252U, 252U, 252U, 252U,
+		252U, 252U, 252U, 252U,
+		237U, 240U,   0U,   0U
+	};
+
+	const uint8x16_t n51 = vdupq_n_u8(51);
+	const uint8x16_t n25 = vdupq_n_u8(25);
+
+	uint8x16x4_t out;
+
+	for (int lane = 0; lane < 4; lane++) {
+		const uint8x16_t value = in.val[lane];
+
+		// Saturating subtraction yields the right index for range #0,
+		// and an index that is one too low for every other range:
+		uint8x16_t index = vqsubq_u8(value, n51);
+
+		// The comparison gives 0xFF (-1) for ranges #1..4 and 0x00
+		// for range #0; subtracting it adds the missing 1:
+		index = vsubq_u8(index, vcgtq_u8(value, n25));
+
+		// Fetch the offset for the range and apply it:
+		out.val[lane] = vaddq_u8(value, vqtbl1q_u8(lut, index));
+	}
+
+	return out;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/codec.c
@@ -0,0 +1,93 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_NEON64
+#include <arm_neon.h>
+
+// Only enable inline assembly on supported compilers.
+#if defined(__GNUC__) || defined(__clang__)
+#define BASE64_NEON64_USE_ASM
+#endif
+
+static BASE64_FORCE_INLINE uint8x16x4_t
+load_64byte_table (const uint8_t *p)
+{	// Loads the 64 bytes at p into four 16-byte vectors and returns them.
+#ifdef BASE64_NEON64_USE_ASM
+
+	// Force the table to be loaded into contiguous registers. GCC will not
+	// normally allocate contiguous registers for a `uint8x16x4_t'. These
+	// registers are chosen to not conflict with the ones in the enc loop.
+	register uint8x16_t t0 __asm__ ("v8");
+	register uint8x16_t t1 __asm__ ("v9");
+	register uint8x16_t t2 __asm__ ("v10");
+	register uint8x16_t t3 __asm__ ("v11");
+	// A single ld1 fills all four registers and post-increments src by 64.
+	__asm__ (
+		"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
+		: [src] "+r" (p),
+		  [t0]  "=w" (t0),
+		  [t1]  "=w" (t1),
+		  [t2]  "=w" (t2),
+		  [t3]  "=w" (t3)
+	);
+	// NOTE(review): no memory input or "memory" clobber is declared for the bytes read through p; confirm this is acceptable.
+	return (uint8x16x4_t) {
+		.val[0] = t0,
+		.val[1] = t1,
+		.val[2] = t2,
+		.val[3] = t3,
+	};
+#else
+	return vld1q_u8_x4(p);	// intrinsics path: let the compiler pick the registers
+#endif
+}
+
+#include "../generic/32/dec_loop.c"
+#include "../generic/64/enc_loop.c"
+#include "dec_loop.c"
+
+#ifdef BASE64_NEON64_USE_ASM
+# include "enc_loop_asm.c"
+#else
+# include "enc_reshuffle.c"
+# include "enc_loop.c"
+#endif
+
+#endif	// HAVE_NEON64
+
+// Stride size is so large on these NEON 64-bit functions
+// (48 bytes encode, 64 bytes decode) that we also inline the generic
+// loops (64-bit encode, 32-bit decode) to stay performant on smaller inputs.
+
+void
+base64_stream_encode_neon64 BASE64_ENC_PARAMS
+{	// NEON64 stream encoder; calls base64_enc_stub() instead when HAVE_NEON64 is not set.
+#if HAVE_NEON64
+	#include "../generic/enc_head.c"
+	enc_loop_neon64(&s, &slen, &o, &olen);	// NEON rounds of 48 input bytes
+	enc_loop_generic_64(&s, &slen, &o, &olen);	// generic 64-bit loop on the input left over by the NEON loop
+	#include "../generic/enc_tail.c"
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+int
+base64_stream_decode_neon64 BASE64_DEC_PARAMS
+{	// NEON64 stream decoder; returns the result of base64_dec_stub() when HAVE_NEON64 is not set.
+#if HAVE_NEON64
+	#include "../generic/dec_head.c"
+	dec_loop_neon64(&s, &slen, &o, &olen);	// NEON rounds of 64 input bytes; stops at the first block with invalid input
+	dec_loop_generic_32(&s, &slen, &o, &olen);	// generic 32-bit loop on the input left over by the NEON loop
+	#include "../generic/dec_tail.c"
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/dec_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/dec_loop.c
@@ -0,0 +1,129 @@
+// The input consists of five valid character sets in the Base64 alphabet,
+// which we need to map back to the 6-bit values they represent.
+// There are three ranges, two singles, and then there's the rest.
+//
+//   #  From       To        LUT  Characters
+//   1  [0..42]    [255]      #1  invalid input
+//   2  [43]       [62]       #1  +
+//   3  [44..46]   [255]      #1  invalid input
+//   4  [47]       [63]       #1  /
+//   5  [48..57]   [52..61]   #1  0..9
+//   6  [58..63]   [255]      #1  invalid input
+//   7  [64]       [255]      #2  invalid input
+//   8  [65..90]   [0..25]    #2  A..Z
+//   9  [91..96]   [255]      #2  invalid input
+//  10  [97..122]  [26..51]   #2  a..z
+//  11  [123..126] [255]      #2  invalid input
+// (12) Everything else => invalid input
+
+// The first LUT will use the VTBL instruction (out of range indices are set to
+// 0 in destination).
+static const uint8_t dec_lut1[] = {
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	// inputs 0..15: invalid
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,	// inputs 16..31: invalid
+	255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,  62U, 255U, 255U, 255U,  63U,	// inputs 32..47: '+' (43) -> 62, '/' (47) -> 63
+	 52U,  53U,  54U,  55U,  56U,  57U,  58U,  59U,  60U,  61U, 255U, 255U, 255U, 255U, 255U, 255U,	// inputs 48..63: '0'..'9' -> 52..61
+};
+
+// The second LUT will use the VTBX instruction (out of range indices will be
+// unchanged in destination). Input [64..126] will be mapped to index [1..63]
+// in this LUT. Index 0 means that value comes from LUT #1.
+static const uint8_t dec_lut2[] = {
+	  0U, 255U,   0U,   1U,   2U,   3U,   4U,   5U,   6U,   7U,   8U,   9U,  10U,  11U,  12U,  13U,	// index 0: value comes from LUT #1; index 1 (input 64): invalid; indices 2..15: 'A'..'N'
+	 14U,  15U,  16U,  17U,  18U,  19U,  20U,  21U,  22U,  23U,  24U,  25U, 255U, 255U, 255U, 255U,	// indices 16..27: 'O'..'Z'; indices 28..31 (inputs 91..94): invalid
+	255U, 255U,  26U,  27U,  28U,  29U,  30U,  31U,  32U,  33U,  34U,  35U,  36U,  37U,  38U,  39U,	// indices 32..33 (inputs 95..96): invalid; indices 34..47: 'a'..'n'
+	 40U,  41U,  42U,  43U,  44U,  45U,  46U,  47U,  48U,  49U,  50U,  51U, 255U, 255U, 255U, 255U,	// indices 48..59: 'o'..'z'; indices 60..63 (inputs 123..126): invalid
+};
+
+// All input values in range for the first look-up will be 0U in the second
+// look-up result. All input values out of range for the first look-up will be
+// 0U in the first look-up result. Thus, the two results can be ORed without
+// conflicts.
+//
+// Invalid characters that are in the valid range for either look-up will be
+// set to 255U in the combined result. Other invalid characters will just be
+// passed through with the second look-up result (using the VTBX instruction).
+// Since the second LUT is 64 bytes, those passed-through values are guaranteed
+// to have a value greater than 63U. Therefore, valid characters will be mapped
+// to the valid [0..63] range and all invalid characters will be mapped to
+// values greater than 63.
+
+// NEON64 bulk decoder. Handles the input in rounds of 64 characters, each of
+// which yields 48 decoded bytes, and stops in front of the first block that
+// contains a character outside the Base64 alphabet.
+//
+// s, slen: input cursor and number of input bytes still available.
+// o, olen: output cursor and number of output bytes produced so far.
+static inline void
+dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// The vector loop needs at least one complete 64-byte block:
+	if (*slen < 64) {
+		return;
+	}
+
+	// Unlike the SSE codecs, no extra trailing zero bytes are written, so
+	// every complete block can be handled here:
+	size_t rounds = *slen / 64;
+
+	// Book all rounds now; rounds that are not run are handed back below:
+	*olen += rounds * 48;
+	*slen -= rounds * 64;
+
+	const uint8x16x4_t tbl_dec1 = load_64byte_table(dec_lut1);
+	const uint8x16x4_t tbl_dec2 = load_64byte_table(dec_lut2);
+	const uint8x16_t n63 = vdupq_n_u8(63U);
+
+	do {
+		// Load 64 bytes and deinterleave:
+		uint8x16x4_t str = vld4q_u8(*s);
+
+		// Collects 0xFF for every byte that decodes to more than 63:
+		uint8x16_t invalid = vdupq_n_u8(0U);
+
+		for (int lane = 0; lane < 4; lane++) {
+
+			// Index into the second LUT (saturating subtraction):
+			const uint8x16_t index2 = vqsubq_u8(str.val[lane], n63);
+
+			// First LUT: out-of-range indices give zero:
+			const uint8x16_t first = vqtbl4q_u8(tbl_dec1, str.val[lane]);
+
+			// Second LUT: out-of-range indices are passed through:
+			const uint8x16_t second = vqtbx4q_u8(index2, tbl_dec2, index2);
+
+			// Combine the two lookups and classify the result:
+			str.val[lane] = vorrq_u8(first, second);
+			invalid = vorrq_u8(invalid, vcgtq_u8(str.val[lane], n63));
+		}
+
+		// Leave the block untouched if any byte was flagged:
+		if (vmaxvq_u8(invalid) != 0U) {
+			break;
+		}
+
+		// Compress four bytes into three:
+		uint8x16x3_t dec;
+
+		dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
+		dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
+		dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
+
+		// Interleave and store decoded result:
+		vst3q_u8(*o, dec);
+
+		*o += 48;
+		*s += 64;
+
+	} while (--rounds > 0);
+
+	// Hand back the rounds that were booked but not executed:
+	*olen -= rounds * 48;
+	*slen += rounds * 64;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_loop.c
@@ -0,0 +1,66 @@
+// Encode a single round: the 48 input bytes at *s become 64 Base64 characters
+// at *o, using the 64-byte encoding table in tbl_enc. Both cursors are moved
+// past the data that was handled.
+static BASE64_FORCE_INLINE void
+enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
+{
+	// Load 48 bytes, deinterleave them, and divide the bits of every three
+	// input bytes over four output bytes:
+	uint8x16x4_t sextets = enc_reshuffle(vld3q_u8(*s));
+
+	// Every byte now holds a value 0..63; a lookup in the 64-byte table
+	// translates it to its character in the Base64 alphabet:
+	for (int lane = 0; lane < 4; lane++) {
+		sextets.val[lane] = vqtbl4q_u8(tbl_enc, sextets.val[lane]);
+	}
+
+	// Interleave and store output:
+	vst4q_u8(*o, sextets);
+
+	*o += 64;
+	*s += 48;
+}
+
+// Encode every complete 48-byte block of the input with the NEON64 round
+// function. s/slen describe the remaining input, o/olen the output written so
+// far; input that does not fill a whole block is left for the caller.
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t todo = *slen / 48;
+
+	// Each round turns 48 input bytes into 64 output bytes:
+	*olen += todo * 64;
+	*slen -= todo * 48;
+
+	// The encoding table is loaded once and shared by all rounds:
+	const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);
+
+	// Eight rounds at a time for as long as possible:
+	for (; todo >= 8; todo -= 8) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+	}
+
+	// At most seven rounds remain; peel them off as 4 + 2 + 1:
+	if (todo >= 4) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		todo -= 4;
+	}
+	if (todo >= 2) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		enc_loop_neon64_inner(s, o, tbl_enc);
+		todo -= 2;
+	}
+	if (todo >= 1) {
+		enc_loop_neon64_inner(s, o, tbl_enc);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_loop_asm.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_loop_asm.c
@@ -0,0 +1,168 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code was
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads 48 bytes into three
+// user-defined registers A, B, C, deinterleaving them and post-incrementing
+// the src pointer by 48. The register set should be sequential.
+#define LOAD(A, B, C) \
+	"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"
+
+// Generate a block of inline assembly that takes three deinterleaved registers
+// and spreads their bits over temporaries t0..t3, with t1..t3 masked by n63.
+#define SHUF(A, B, C) \
+	"ushr %[t0].16b, "A".16b,   #2         \n\t" \
+	"ushr %[t1].16b, "B".16b,   #4         \n\t" \
+	"ushr %[t2].16b, "C".16b,   #6         \n\t" \
+	"sli  %[t1].16b, "A".16b,   #4         \n\t" \
+	"sli  %[t2].16b, "B".16b,   #2         \n\t" \
+	"and  %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
+	"and  %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
+	"and  %[t3].16b, "C".16b,   %[n63].16b \n\t"
+
+// Generate a block of inline assembly that takes temporary registers t0..t3
+// and translates them to the base64 alphabet, using `tbl' lookups into the
+// table held in v8..v11. The output is in user-defined registers A..D.
+#define TRAN(A, B, C, D) \
+	"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
+	"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
+	"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
+	"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
+
+// Generate a block of inline assembly that interleaves four registers and
+// stores the 64 bytes, post-incrementing the destination pointer by 64.
+#define STOR(A, B, C, D) \
+	"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"
+
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round in v12..v15: fetch the data, process it, and store the result.
+#define ROUND() \
+	LOAD("v12", "v13", "v14") \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Generate a block of assembly that generates a type A interleaved encoder
+// round. It works on v2..v5, whose inputs were loaded by the previous type B
+// round, and in turn loads v12..v14 for the next type B round.
+#define ROUND_A() \
+	SHUF("v2",  "v3",  "v4") \
+	LOAD("v12", "v13", "v14") \
+	TRAN("v2",  "v3",  "v4", "v5") \
+	STOR("v2",  "v3",  "v4", "v5")
+
+// Type B interleaved encoder round. Same as type A, but register sets swapped.
+#define ROUND_B() \
+	SHUF("v12", "v13", "v14") \
+	LOAD("v2",  "v3",  "v4") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// The first type A round needs to load its own registers (v2..v4).
+#define ROUND_A_FIRST() \
+	LOAD("v2", "v3", "v4") \
+	ROUND_A()
+
+// The last type B round omits the load for the next step.
+#define ROUND_B_LAST() \
+	SHUF("v12", "v13", "v14") \
+	TRAN("v12", "v13", "v14", "v15") \
+	STOR("v12", "v13", "v14", "v15")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
+// Encode every complete 48-byte block of the input using AArch64 inline
+// assembly.
+//
+// s, slen: input cursor and number of input bytes still available.
+// o, olen: output cursor and number of output bytes produced so far.
+//
+// Each round turns 48 input bytes into 64 Base64 characters. The counters are
+// adjusted up front; the pointers are advanced by the post-incrementing loads
+// and stores inside the asm statement. Nothing is touched when fewer than 48
+// input bytes are available.
+static inline void
+enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	size_t rounds = *slen / 48;
+
+	if (rounds == 0) {
+		return;
+	}
+
+	*slen -= rounds * 48;	// 48 bytes consumed per round.
+	*olen += rounds * 64;	// 64 bytes produced per round.
+
+	// Number of times to go through the 8x loop.
+	size_t loops = rounds / 8;
+
+	// Number of rounds remaining after the 8x loop.
+	rounds %= 8;
+
+	// Temporary registers, used as scratch space.
+	uint8x16_t tmp0, tmp1, tmp2, tmp3;
+
+	__asm__ volatile (
+
+		// Load the encoding table into v8..v11.
+		"    ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"
+
+		// If there are eight rounds or more, enter an 8x unrolled loop
+		// of interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations to maximize
+		// pipeline throughput.
+		"    cbz %[loops], 4f \n\t"
+
+		// The SIMD instructions do not touch the flags.
+		"88: subs %[loops], %[loops], #1 \n\t"
+		"    " ROUND_A_FIRST()
+		"    " ROUND_B()
+		"    " ROUND_A()
+		"    " ROUND_B()
+		"    " ROUND_A()
+		"    " ROUND_B()
+		"    " ROUND_A()
+		"    " ROUND_B_LAST()
+		"    b.ne 88b \n\t"
+
+		// Enter a 4x unrolled loop for rounds of 4 or more.
+		"4:  cmp  %[rounds], #4 \n\t"
+		"    b.lt 30f           \n\t"
+		"    " ROUND_A_FIRST()
+		"    " ROUND_B()
+		"    " ROUND_A()
+		"    " ROUND_B_LAST()
+		"    sub %[rounds], %[rounds], #4 \n\t"
+
+		// Dispatch the remaining rounds 0..3.
+		"30: cbz  %[rounds], 0f \n\t"
+		"    cmp  %[rounds], #2 \n\t"
+		"    b.eq 2f            \n\t"
+		"    b.lt 1f            \n\t"
+
+		// Block of non-interleaved encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		"3:  " ROUND()
+		"2:  " ROUND()
+		"1:  " ROUND()
+		"0:  \n\t"
+
+		// Outputs (modified). `rounds' is decremented by the 4x block
+		// above, so it has to be a read-write operand: the compiler
+		// assumes that input-only operands are left unchanged.
+		: [loops]  "+r"  (loops),
+		  [rounds] "+r"  (rounds),
+		  [src]    "+r"  (*s),
+		  [dst]    "+r"  (*o),
+		  [t0]     "=&w" (tmp0),
+		  [t1]     "=&w" (tmp1),
+		  [t2]     "=&w" (tmp2),
+		  [t3]     "=&w" (tmp3)
+
+		// Inputs (not modified).
+		: [tbl]    "r" (base64_table_enc_6bit),
+		  [n63]    "w" (vdupq_n_u8(63))
+
+		// Clobbers.
+		: "v2",  "v3",  "v4",  "v5",
+		  "v8",  "v9",  "v10", "v11",
+		  "v12", "v13", "v14", "v15",
+		  "cc", "memory"
+	);
+}
+
+#pragma GCC diagnostic pop
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/neon64/enc_reshuffle.c
@@ -0,0 +1,31 @@
+// Divide the bits of three vectors of input bytes over four vectors of 6-bit
+// values.
+//
+// Per byte position, with the inputs written as bit strings:
+//   in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
+//   in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
+//   in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
+// the result is:
+//   out[0] = 00 00 a7 a6 a5 a4 a3 a2
+//   out[1] = 00 00 a1 a0 b7 b6 b5 b4
+//   out[2] = 00 00 b3 b2 b1 b0 c7 c6
+//   out[3] = 00 00 c5 c4 c3 c2 c1 c0
+static BASE64_FORCE_INLINE uint8x16x4_t
+enc_reshuffle (const uint8x16x3_t in)
+{
+	const uint8x16_t sextet_mask = vdupq_n_u8(0x3F);
+
+	// Right-shift the low part of each middle output into place, then
+	// shift-left-insert the bits taken from the preceding input byte. The
+	// high two bits are not clean yet at this point:
+	const uint8x16_t mid_lo = vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4);
+	const uint8x16_t mid_hi = vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2);
+
+	uint8x16x4_t out;
+
+	// The first output has its high two bits cleared by the shift itself;
+	// the second, third and fourth outputs are masked:
+	out.val[0] = vshrq_n_u8(in.val[0], 2);
+	out.val[1] = vandq_u8(mid_lo, sextet_mask);
+	out.val[2] = vandq_u8(mid_hi, sextet_mask);
+	out.val[3] = vandq_u8(in.val[2], sextet_mask);
+
+	return out;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/sse41/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/sse41/codec.c
@@ -0,0 +1,58 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE41
+#include <smmintrin.h>
+
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+#ifndef BASE64_SSE41_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+#  define BASE64_SSE41_USE_ASM 1
+# else
+#  define BASE64_SSE41_USE_ASM 0
+# endif
+#endif
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+
+#if BASE64_SSE41_USE_ASM
+# include "../ssse3/enc_loop_asm.c"
+#else
+# include "../ssse3/enc_translate.c"
+# include "../ssse3/enc_reshuffle.c"
+# include "../ssse3/enc_loop.c"
+#endif
+
+#endif	// HAVE_SSE41
+
+void
+base64_stream_encode_sse41 BASE64_ENC_PARAMS
+{	// SSE4.1 stream encoder; calls base64_enc_stub() instead when HAVE_SSE41 is not set.
+#if HAVE_SSE41
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);	// the SSE4.1 codec reuses the SSSE3 encoder loop
+	#include "../generic/enc_tail.c"
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+int
+base64_stream_decode_sse41 BASE64_DEC_PARAMS
+{	// SSE4.1 stream decoder; returns the result of base64_dec_stub() when HAVE_SSE41 is not set.
+#if HAVE_SSE41
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);	// the SSE4.1 codec reuses the SSSE3 decoder loop
+	#include "../generic/dec_tail.c"
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/sse42/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/sse42/codec.c
@@ -0,0 +1,58 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSE42
+#include <nmmintrin.h>
+
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+#ifndef BASE64_SSE42_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+#  define BASE64_SSE42_USE_ASM 1
+# else
+#  define BASE64_SSE42_USE_ASM 0
+# endif
+#endif
+
+#include "../ssse3/dec_reshuffle.c"
+#include "../ssse3/dec_loop.c"
+
+#if BASE64_SSE42_USE_ASM
+# include "../ssse3/enc_loop_asm.c"
+#else
+# include "../ssse3/enc_translate.c"
+# include "../ssse3/enc_reshuffle.c"
+# include "../ssse3/enc_loop.c"
+#endif
+
+#endif	// HAVE_SSE42
+
+void
+base64_stream_encode_sse42 BASE64_ENC_PARAMS
+{	// SSE4.2 stream encoder; calls base64_enc_stub() instead when HAVE_SSE42 is not set.
+#if HAVE_SSE42
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);	// the SSE4.2 codec reuses the SSSE3 encoder loop
+	#include "../generic/enc_tail.c"
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+int
+base64_stream_decode_sse42 BASE64_DEC_PARAMS
+{	// SSE4.2 stream decoder; returns the result of base64_dec_stub() when HAVE_SSE42 is not set.
+#if HAVE_SSE42
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);	// the SSE4.2 codec reuses the SSSE3 decoder loop
+	#include "../generic/dec_tail.c"
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/codec.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/codec.c
@@ -0,0 +1,60 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+
+#include "libbase64.h"
+#include "../../tables/tables.h"
+#include "../../codecs.h"
+#include "config.h"
+#include "../../env.h"
+
+#if HAVE_SSSE3
+#include <tmmintrin.h>
+
+// Only enable inline assembly on supported compilers and on 64-bit CPUs.
+// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
+// registers, which is not enough to run the inline assembly.
+#ifndef BASE64_SSSE3_USE_ASM
+# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
+#  define BASE64_SSSE3_USE_ASM 1
+# else
+#  define BASE64_SSSE3_USE_ASM 0
+# endif
+#endif
+
+#include "dec_reshuffle.c"
+#include "dec_loop.c"
+
+#if BASE64_SSSE3_USE_ASM
+# include "enc_loop_asm.c"
+#else
+# include "enc_reshuffle.c"
+# include "enc_translate.c"
+# include "enc_loop.c"
+#endif
+
+#endif	// HAVE_SSSE3
+
+void
+base64_stream_encode_ssse3 BASE64_ENC_PARAMS
+{	// SSSE3 stream encoder; calls base64_enc_stub() instead when HAVE_SSSE3 is not set.
+#if HAVE_SSSE3
+	#include "../generic/enc_head.c"
+	enc_loop_ssse3(&s, &slen, &o, &olen);	// vectorized encoder loop, run between the included generic head and tail
+	#include "../generic/enc_tail.c"
+#else
+	base64_enc_stub(state, src, srclen, out, outlen);
+#endif
+}
+
+int
+base64_stream_decode_ssse3 BASE64_DEC_PARAMS
+{	// SSSE3 stream decoder; returns the result of base64_dec_stub() when HAVE_SSSE3 is not set.
+#if HAVE_SSSE3
+	#include "../generic/dec_head.c"
+	dec_loop_ssse3(&s, &slen, &o, &olen);	// vectorized decoder loop, run between the included generic head and tail
+	#include "../generic/dec_tail.c"
+#else
+	return base64_dec_stub(state, src, srclen, out, outlen);
+#endif
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/dec_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/dec_loop.c
@@ -0,0 +1,173 @@
+// The input consists of six character sets in the Base64 alphabet, which we
+// need to map back to the 6-bit values they represent. There are three ranges,
+// two singles, and then there's the rest.
+//
+//  #  From       To        Add  Characters
+//  1  [43]       [62]      +19  +
+//  2  [47]       [63]      +16  /
+//  3  [48..57]   [52..61]   +4  0..9
+//  4  [65..90]   [0..25]   -65  A..Z
+//  5  [97..122]  [26..51]  -71  a..z
+// (6) Everything else => invalid input
+//
+// We will use lookup tables for character validation and offset computation.
+// Remember that 0x2X and 0x0X are the same index for _mm_shuffle_epi8; this
+// allows masking with 0x2F instead of 0x0F, and thus saves one constant
+// declaration (register and/or memory access).
+//
+// For offsets:
+// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
+// 0000 = garbage
+// 0001 = /
+// 0010 = +
+// 0011 = 0-9
+// 0100 = A-Z
+// 0101 = A-Z
+// 0110 = a-z
+// 0111 = a-z
+// 1000 and above = garbage
+//
+// For validation, here's the table.
+// A character is valid if and only if the AND of the 2 lookups equals 0:
+//
+// hi \ lo              0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
+//      LUT             0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
+//
+// 0000 0x10 char        NUL  SOH  STX  ETX  EOT  ENQ  ACK  BEL   BS   HT   LF   VT   FF   CR   SO   SI
+//           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+//
+// 0001 0x10 char        DLE  DC1  DC2  DC3  DC4  NAK  SYN  ETB  CAN   EM  SUB  ESC   FS   GS   RS   US
+//           andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+//
+// 0010 0x01 char               !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
+//           andlut     0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
+//
+// 0011 0x02 char          0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
+//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
+//
+// 0100 0x04 char          @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
+//           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+//
+// 0101 0x08 char          P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
+//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+//
+// 0110 0x04 char          `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
+//           andlut     0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+// 0111 0x08 char          p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~
+//           andlut     0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
+//
+// 1000 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1001 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1010 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1011 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1100 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1101 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1110 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+// 1111 0x10 andlut     0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
+
+// Decode one 16-character block of Base64 text into 12 output bytes.
+//
+// s       in/out input cursor; 16 bytes are loaded from *s and the cursor is
+//         advanced by 16 on success.
+// o       in/out output cursor; a full 16-byte vector is stored at *o (the
+//         reshuffle leaves the last 4 bytes zero) and the cursor is advanced
+//         by 12 on success.
+// rounds  in/out count of remaining blocks, decremented by one on success.
+//
+// Returns 1 when the block was decoded. Returns 0 when the block contains a
+// character outside the Base64 alphabet; in that case nothing is stored and
+// neither the cursors nor the counter are modified.
+static BASE64_FORCE_INLINE int
+dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
+{
+	// Validation table indexed by the low nibble of each character:
+	const __m128i lut_lo = _mm_setr_epi8(
+		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
+		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
+
+	// Validation table indexed by the high nibble of each character. A
+	// character is valid if and only if (lut_lo & lut_hi) is zero:
+	const __m128i lut_hi = _mm_setr_epi8(
+		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
+		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
+
+	// Offset to add to a character to obtain its 6-bit value, indexed by
+	// the high nibble. Index 1 is reserved for '/', which shares its high
+	// nibble with '+' and is redirected there via eq_2F below:
+	const __m128i lut_roll = _mm_setr_epi8(
+		0,  16,  19,   4, -65, -65, -71, -71,
+		0,   0,   0,   0,   0,   0,   0,   0);
+
+	// Doubles as the nibble mask and as the character code of '/':
+	const __m128i mask_2F = _mm_set1_epi8(0x2F);
+
+	// Load input:
+	__m128i str = _mm_loadu_si128((__m128i *) *s);
+
+	// Table lookups:
+	// The shift works on 32-bit lanes, so bits of the neighbouring byte
+	// end up in the upper half of each byte; the mask clears bit 7, which
+	// would otherwise make _mm_shuffle_epi8 zero the lane.
+	const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
+	const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
+	const __m128i hi         = _mm_shuffle_epi8(lut_hi, hi_nibbles);
+	const __m128i lo         = _mm_shuffle_epi8(lut_lo, lo_nibbles);
+
+	// Check for invalid input: if any "and" values from lo and hi are not
+	// zero, fall back on bytewise code to do error checking and reporting:
+	if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
+		return 0;
+	}
+
+	// eq_2F is 0xFF (-1) for '/' and 0 otherwise, so adding it to the high
+	// nibble moves '/' from index 2 to index 1 of lut_roll:
+	const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
+	const __m128i roll  = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
+
+	// Now simply add the delta values to the input:
+	str = _mm_add_epi8(str, roll);
+
+	// Reshuffle the input to packed 12-byte output format:
+	str = dec_reshuffle(str);
+
+	// Store the output:
+	_mm_storeu_si128((__m128i *) *o, str);
+
+	*s += 16;
+	*o += 12;
+	*rounds -= 1;
+
+	return 1;
+}
+
+// Bulk-decode as many whole 16-character blocks as possible.
+//
+// s, o   in/out cursors, advanced by 16 / 12 bytes per decoded block.
+// slen   in/out remaining input length, reduced by 16 per decoded block.
+// olen   in/out output length so far, increased by 12 per decoded block.
+//
+// Does nothing when fewer than 24 input bytes remain. Decoding stops at the
+// first block that fails validation; that block and everything after it are
+// left unconsumed and the lengths are corrected accordingly.
+static inline void
+dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	if (*slen < 24) {
+		return;
+	}
+
+	// Process blocks of 16 bytes per round. Because 4 extra zero bytes are
+	// written after the output, ensure that there will be at least 8 bytes
+	// of input data left to cover the gap. (6 data bytes and up to two
+	// end-of-string markers.)
+	size_t rounds = (*slen - 8) / 16;
+
+	// Optimistically account for every round; rounds that end up not being
+	// executed are added back after the loop:
+	*slen -= rounds * 16;	// 16 bytes consumed per round
+	*olen += rounds * 12;	// 12 bytes produced per round
+
+	// Unrolled in batches of 8, 4, 2 and 1. The inner function decrements
+	// `rounds` itself on success; the && chains stop at the first failing
+	// block, after which `break` leaves the loop with `rounds` still
+	// counting the blocks that were not decoded:
+	do {
+		if (rounds >= 8) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 4) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		if (rounds >= 2) {
+			if (dec_loop_ssse3_inner(s, o, &rounds) &&
+			    dec_loop_ssse3_inner(s, o, &rounds)) {
+				continue;
+			}
+			break;
+		}
+		// Exactly one round left; its outcome needs no further dispatch:
+		dec_loop_ssse3_inner(s, o, &rounds);
+		break;
+
+	} while (rounds > 0);
+
+	// Adjust for any rounds that were skipped:
+	*slen += rounds * 16;
+	*olen -= rounds * 12;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/dec_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/dec_reshuffle.c
@@ -0,0 +1,33 @@
+// Pack sixteen 6-bit values (one per input byte) into 12 contiguous bytes.
+// The result holds the packed data in its first 12 bytes; the last 4 bytes
+// are zero (the final shuffle uses index -1 for them).
+static BASE64_FORCE_INLINE __m128i
+dec_reshuffle (const __m128i in)
+{
+	// in, bits, upper case are most significant bits, lower case are least significant bits
+	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
+	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
+	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
+	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+
+	// Multiply the bytes of each pair by 0x40 and 0x01 and add them, which
+	// fuses two 6-bit values into one 12-bit value per 16-bit word:
+	const __m128i merge_ab_and_bc = _mm_maddubs_epi16(in, _mm_set1_epi32(0x01400140));
+	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
+	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
+	// 0000eeee FFffffff 0000DDDD DDddEEEE
+	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB
+
+	// Multiply the words of each pair by 0x1000 and 0x0001 and add them,
+	// which fuses two 12-bit values into one 24-bit value per 32-bit lane:
+	const __m128i out = _mm_madd_epi16(merge_ab_and_bc, _mm_set1_epi32(0x00011000));
+	// 00000000 JJJJJJjj KKKKkkkk LLllllll
+	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
+	// 00000000 DDDDDDdd EEEEeeee FFffffff
+	// 00000000 AAAAAAaa BBBBbbbb CCcccccc
+
+	// Pack bytes together:
+	// (each lane's three data bytes are emitted in reverse order and the
+	// unused top byte of every lane is dropped)
+	return  _mm_shuffle_epi8(out, _mm_setr_epi8(
+		 2,  1,  0,
+		 6,  5,  4,
+		10,  9,  8,
+		14, 13, 12,
+		-1, -1, -1, -1));
+	// 00000000 00000000 00000000 00000000
+	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
+	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
+	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_loop.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_loop.c
@@ -0,0 +1,67 @@
+// Encode a single 12-byte group into 16 Base64 characters.
+//
+// A full 16-byte vector is loaded from *s and a full 16-byte vector is
+// stored at *o. On return *s has advanced by 12 and *o by 16.
+static BASE64_FORCE_INLINE void
+enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
+{
+	const uint8_t *src = *s;
+	uint8_t       *dst = *o;
+
+	// Load, spread into 6-bit fields, then map onto the alphabet:
+	const __m128i raw     = _mm_loadu_si128((__m128i *) src);
+	const __m128i sextets = enc_reshuffle(raw);
+	const __m128i encoded = enc_translate(sextets);
+
+	_mm_storeu_si128((__m128i *) dst, encoded);
+
+	// Step the cursors past what was consumed and produced:
+	*s = src + 12;
+	*o = dst + 16;
+}
+
+// Bulk-encode the input in groups of 12 bytes, producing 16 characters per
+// group.
+//
+// s, o   in/out cursors, advanced by the inner function.
+// slen   in/out remaining input length, reduced by 12 per group.
+// olen   in/out output length so far, increased by 16 per group.
+//
+// Returns without doing anything when fewer than 16 input bytes remain.
+static inline void
+enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// Not enough data for even one vector-wide load:
+	if (*slen < 16) {
+		return;
+	}
+
+	// Each group loads 16 bytes but consumes only 12. Holding back 4 bytes
+	// of input keeps the last load inside the input buffer:
+	size_t remaining = (*slen - 4) / 12;
+
+	// Book all groups up front:
+	*slen -= remaining * 12;	// 12 bytes consumed per group
+	*olen += remaining * 16;	// 16 bytes produced per group
+
+	// Unrolled batches of eight for as long as they fit...
+	while (remaining >= 8) {
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		remaining -= 8;
+	}
+
+	// ...then at most one batch each of four, two and one:
+	if (remaining >= 4) {
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		remaining -= 4;
+	}
+	if (remaining >= 2) {
+		enc_loop_ssse3_inner(s, o);
+		enc_loop_ssse3_inner(s, o);
+		remaining -= 2;
+	}
+	if (remaining >= 1) {
+		enc_loop_ssse3_inner(s, o);
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_loop_asm.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_loop_asm.c
@@ -0,0 +1,268 @@
+// Apologies in advance for combining the preprocessor with inline assembly,
+// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
+// code repetition. The preprocessor is used to template large sections of
+// inline assembly that differ only in the registers used. If the code was
+// written out by hand, it would become very large and hard to audit.
+
+// Generate a block of inline assembly that loads register R0 from memory. The
+// offset at which the register is loaded is set by the given round: every
+// round consumes 12 input bytes, so the displacement from %[src] is
+// ROUND * 12. R0 is a string literal naming an asm operand (e.g. "a").
+#define LOAD(R0, ROUND) \
+	"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
+
+// Generate a block of inline assembly that deinterleaves and shuffles register
+// R0 using preloaded constants. The instruction sequence follows the same
+// steps as enc_reshuffle(): a byte shuffle through lut0, then two
+// mask-and-multiply pairs (msk0/msk1 and msk2/msk3) that are OR-ed together.
+// The combined result ends up in R0; R1 is overwritten with an intermediate
+// value.
+#define SHUF(R0, R1) \
+	"pshufb  %[lut0], %["R0"] \n\t" \
+	"movdqa  %["R0"], %["R1"] \n\t" \
+	"pand    %[msk0], %["R0"] \n\t" \
+	"pand    %[msk2], %["R1"] \n\t" \
+	"pmulhuw %[msk1], %["R0"] \n\t" \
+	"pmullw  %[msk3], %["R1"] \n\t" \
+	"por     %["R1"], %["R0"] \n\t"
+
+// Generate a block of inline assembly that translates the contents of R0 to
+// the base64 alphabet, using preloaded constants. The steps mirror
+// enc_translate(): saturating subtract of n51, compare against n25, table
+// lookup in lut1 and a final add. R1 and R2 are overwritten as temporaries;
+// the result is left in R0.
+#define TRAN(R0, R1, R2) \
+	"movdqa  %["R0"], %["R1"] \n\t" \
+	"movdqa  %["R0"], %["R2"] \n\t" \
+	"psubusb %[n51],  %["R1"] \n\t" \
+	"pcmpgtb %[n25],  %["R2"] \n\t" \
+	"psubb   %["R2"], %["R1"] \n\t" \
+	"movdqa  %[lut1], %["R2"] \n\t" \
+	"pshufb  %["R1"], %["R2"] \n\t" \
+	"paddb   %["R2"], %["R0"] \n\t"
+
+// Generate a block of inline assembly that stores the given register R0 at an
+// offset set by the given round: every round produces 16 output bytes, so the
+// displacement from %[dst] is ROUND * 16.
+#define STOR(R0, ROUND) \
+	"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
+
+// Generate a block of inline assembly that generates a single self-contained
+// encoder round: fetch the data, process it, and store the result. Then update
+// the source and destination pointers. It always works at displacement 0 and
+// uses registers a, b and c.
+#define ROUND() \
+	LOAD("a", 0) \
+	SHUF("a", "b") \
+	TRAN("a", "b", "c") \
+	STOR("a", 0) \
+	"add $12, %[src] \n\t" \
+	"add $16, %[dst] \n\t"
+
+// Define a macro that initiates a three-way interleaved encoding round by
+// preloading registers a, b and c from memory. On exit, register a has
+// already been shuffled and translated, while b and c have only been loaded.
+// The register graph shows which registers are in use during each step, and
+// is a visual aid for choosing registers for that step. Symbol index:
+//
+//  +  indicates that a register is loaded by that step.
+//  |  indicates that a register is in use and must not be touched.
+//  -  indicates that a register is decommissioned by that step.
+//  x  indicates that a register is used as a temporary by that step.
+//  V  indicates that a register is an input or output to the macro.
+//
+#define ROUND_3_INIT() 			/*  a b c d e f  */ \
+	LOAD("a", 0)			/*  +            */ \
+	SHUF("a", "d")			/*  |     +      */ \
+	LOAD("b", 1)			/*  | +   |      */ \
+	TRAN("a", "d", "e")		/*  | |   - x    */ \
+	LOAD("c", 2)			/*  V V V        */
+
+// Define a macro that translates, shuffles and stores the input registers A, B
+// and C, and preloads registers D, E and F for the next round.
+// This macro can be arbitrarily daisy-chained by feeding output registers D, E
+// and F back into the next round as input registers A, B and C. The macro
+// carefully interleaves memory operations with data operations for optimal
+// pipelined performance.
+// It stores rounds ROUND+0..ROUND+2 and loads rounds ROUND+3..ROUND+5. On
+// exit, D has been shuffled and translated while E and F are only loaded,
+// matching the state that ROUND_3_INIT leaves in a, b and c. The source and
+// destination pointers are not advanced here.
+
+#define ROUND_3(ROUND, A,B,C,D,E,F) 	/*  A B C D E F  */ \
+	LOAD(D, (ROUND + 3))		/*  V V V +      */ \
+	SHUF(B, E)			/*  | | | | +    */ \
+	STOR(A, (ROUND + 0))		/*  - | | | |    */ \
+	TRAN(B, E, F)			/*    | | | - x  */ \
+	LOAD(E, (ROUND + 4))		/*    | | | +    */ \
+	SHUF(C, A)			/*  + | | | |    */ \
+	STOR(B, (ROUND + 1))		/*  | - | | |    */ \
+	TRAN(C, A, F)			/*  -   | | | x  */ \
+	LOAD(F, (ROUND + 5))		/*      | | | +  */ \
+	SHUF(D, A)			/*  +   | | | |  */ \
+	STOR(C, (ROUND + 2))		/*  |   - | | |  */ \
+	TRAN(D, A, B)			/*  - x   V V V  */
+
+// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
+// registers D, E and F, and translating, shuffling and storing them.
+// It stores rounds ROUND+3..ROUND+5 and performs no further loads.
+#define ROUND_3_END(ROUND, A,B,C,D,E,F)	/*  A B C D E F  */ \
+	SHUF(E, A)			/*  +     V V V  */ \
+	STOR(D, (ROUND + 3))		/*  |     - | |  */ \
+	TRAN(E, A, B)			/*  - x     | |  */ \
+	SHUF(F, C)			/*      +   | |  */ \
+	STOR(E, (ROUND + 4))		/*      |   - |  */ \
+	TRAN(F, C, D)			/*      - x   |  */ \
+	STOR(F, (ROUND + 5))		/*            -  */
+
+// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
+#define ROUND_3_A(ROUND) \
+	ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Define a type B round. Inputs and outputs are swapped with regard to type A.
+#define ROUND_3_B(ROUND) \
+	ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Terminating macro for a type A round: a regular type A round followed by
+// the drain of d, e and f, so it covers rounds ROUND..ROUND+5.
+#define ROUND_3_A_LAST(ROUND) \
+	ROUND_3_A(ROUND) \
+	ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
+
+// Terminating macro for a type B round: a regular type B round followed by
+// the drain of a, b and c, so it covers rounds ROUND..ROUND+5.
+#define ROUND_3_B_LAST(ROUND) \
+	ROUND_3_B(ROUND) \
+	ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
+
+// Suppress clang's warning that the literal string in the asm statement is
+// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
+// compilers). It may be true, but the goal here is not C99 portability.
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Woverlength-strings"
+
+// Inline-assembly variant of the SSSE3 bulk encoder.
+//
+// s, o   in/out cursors; bound to the asm statement as read-write operands
+//        and advanced by 12 / 16 bytes per round.
+// slen   in/out remaining input length, reduced by 12 per round.
+// olen   in/out output length so far, increased by 16 per round.
+//
+// Returns without doing anything when fewer than 16 input bytes remain.
+static inline void
+enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
+{
+	// For a clearer explanation of the algorithm used by this function,
+	// please refer to the plain (not inline assembly) implementation. This
+	// function follows the same basic logic.
+
+	if (*slen < 16) {
+		return;
+	}
+
+	// Process blocks of 12 bytes at a time. Input is read in blocks of 16
+	// bytes, so "reserve" four bytes from the input buffer to ensure that
+	// we never read beyond the end of the input buffer.
+	size_t rounds = (*slen - 4) / 12;
+
+	*slen -= rounds * 12;   // 12 bytes consumed per round
+	*olen += rounds * 16;   // 16 bytes produced per round
+
+	// Number of times to go through the 36x loop.
+	size_t loops = rounds / 36;
+
+	// Number of rounds remaining after the 36x loop.
+	rounds %= 36;
+
+	// Lookup tables.
+	// lut0 holds the same byte-shuffle indices as enc_reshuffle().
+	const __m128i lut0 = _mm_set_epi8(
+		10, 11,  9, 10,  7,  8,  6,  7,  4,  5,  3,  4,  1,  2,  0,  1);
+
+	// lut1 holds the same alphabet offsets as enc_translate().
+	const __m128i lut1 = _mm_setr_epi8(
+		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
+
+	// Temporary registers.
+	// They are never read from C; they only name the scratch operands
+	// [a]..[f] of the asm statement.
+	__m128i a, b, c, d, e, f;
+
+	__asm__ volatile (
+
+		// If there are 36 rounds or more, enter a 36x unrolled loop of
+		// interleaved encoding rounds. The rounds interleave memory
+		// operations (load/store) with data operations (table lookups,
+		// etc) to maximize pipeline throughput.
+		"    test %[loops], %[loops] \n\t"
+		"    jz   18f                \n\t"
+		"    jmp  36f                \n\t"
+		"                            \n\t"
+		".balign 64                  \n\t"
+		"36: " ROUND_3_INIT()
+		"    " ROUND_3_A( 0)
+		"    " ROUND_3_B( 3)
+		"    " ROUND_3_A( 6)
+		"    " ROUND_3_B( 9)
+		"    " ROUND_3_A(12)
+		"    " ROUND_3_B(15)
+		"    " ROUND_3_A(18)
+		"    " ROUND_3_B(21)
+		"    " ROUND_3_A(24)
+		"    " ROUND_3_B(27)
+		"    " ROUND_3_A_LAST(30)
+		// The interleaved rounds address memory through fixed
+		// displacements, so the pointers are bumped once per batch:
+		"    add $(12 * 36), %[src] \n\t"
+		"    add $(16 * 36), %[dst] \n\t"
+		"    dec %[loops]           \n\t"
+		"    jnz 36b                \n\t"
+
+		// Enter an 18x unrolled loop for rounds of 18 or more.
+		// (No backward jump: with fewer than 36 rounds left, this
+		// section runs at most once.)
+		"18: cmp $18, %[rounds] \n\t"
+		"    jl  9f             \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B(3)
+		"    " ROUND_3_A(6)
+		"    " ROUND_3_B(9)
+		"    " ROUND_3_A_LAST(12)
+		"    sub $18,        %[rounds] \n\t"
+		"    add $(12 * 18), %[src]    \n\t"
+		"    add $(16 * 18), %[dst]    \n\t"
+
+		// Enter a 9x unrolled loop for rounds of 9 or more.
+		"9:  cmp $9, %[rounds] \n\t"
+		"    jl  6f            \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A(0)
+		"    " ROUND_3_B_LAST(3)
+		"    sub $9,        %[rounds] \n\t"
+		"    add $(12 * 9), %[src]    \n\t"
+		"    add $(16 * 9), %[dst]    \n\t"
+
+		// Enter a 6x unrolled loop for rounds of 6 or more.
+		"6:  cmp $6, %[rounds] \n\t"
+		"    jl  55f           \n\t"
+		"    " ROUND_3_INIT()
+		"    " ROUND_3_A_LAST(0)
+		"    sub $6,        %[rounds] \n\t"
+		"    add $(12 * 6), %[src]    \n\t"
+		"    add $(16 * 6), %[dst]    \n\t"
+
+		// Dispatch the remaining rounds 0..5.
+		"55: cmp $3, %[rounds] \n\t"
+		"    jg  45f           \n\t"
+		"    je  3f            \n\t"
+		"    cmp $1, %[rounds] \n\t"
+		"    jg  2f            \n\t"
+		"    je  1f            \n\t"
+		"    jmp 0f            \n\t"
+
+		"45: cmp $4, %[rounds] \n\t"
+		"    je  4f            \n\t"
+
+		// Block of non-interleaved encoding rounds, which can each
+		// individually be jumped to. Rounds fall through to the next.
+		// Each ROUND() advances the pointers itself.
+		"5: " ROUND()
+		"4: " ROUND()
+		"3: " ROUND()
+		"2: " ROUND()
+		"1: " ROUND()
+		"0: \n\t"
+
+		// Outputs (modified).
+		// The scratch registers are marked early-clobber ("&") so
+		// they cannot share a register with any input operand.
+		: [rounds] "+r"  (rounds),
+		  [loops]  "+r"  (loops),
+		  [src]    "+r"  (*s),
+		  [dst]    "+r"  (*o),
+		  [a]      "=&x" (a),
+		  [b]      "=&x" (b),
+		  [c]      "=&x" (c),
+		  [d]      "=&x" (d),
+		  [e]      "=&x" (e),
+		  [f]      "=&x" (f)
+
+		// Inputs (not modified).
+		// msk0..msk3, n51 and n25 carry the same constants that
+		// enc_reshuffle() and enc_translate() use.
+		: [lut0] "x" (lut0),
+		  [lut1] "x" (lut1),
+		  [msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
+		  [msk1] "x" (_mm_set1_epi32(0x04000040)),
+		  [msk2] "x" (_mm_set1_epi32(0x003F03F0)),
+		  [msk3] "x" (_mm_set1_epi32(0x01000010)),
+		  [n51]  "x" (_mm_set1_epi8(51)),
+		  [n25]  "x" (_mm_set1_epi8(25))
+
+		// Clobbers.
+		: "cc", "memory"
+	);
+}
+
+#pragma GCC diagnostic pop
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_reshuffle.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_reshuffle.c
@@ -0,0 +1,48 @@
+// Spread 12 input bytes over sixteen 6-bit fields, one field per output
+// byte. Only the low 12 bytes of `in` are used.
+//
+// Each 32-bit lane handles three input bytes a, b, c. In the diagrams below
+// lanes are written MSB to LSB; upper case letters are the most significant
+// bits of a byte, lower case letters the least significant ones.
+static BASE64_FORCE_INLINE __m128i
+enc_reshuffle (__m128i in)
+{
+	// Byte selector, listed from the lowest lane byte upwards. Every lane
+	// receives its three source bytes in the order (b, a, c, b), i.e.
+	// "b c a b" when read MSB to LSB:
+	const __m128i pick = _mm_setr_epi8(
+		 1,  0,  2,  1,
+		 4,  3,  5,  4,
+		 7,  6,  8,  7,
+		10,  9, 11, 10);
+
+	// Bit masks and per-word multipliers, replicated into all four lanes:
+	const __m128i mask_hi  = _mm_set1_epi32(0x0FC0FC00);
+	const __m128i scale_hi = _mm_set1_epi32(0x04000040);
+	const __m128i mask_lo  = _mm_set1_epi32(0x003F03F0);
+	const __m128i scale_lo = _mm_set1_epi32(0x01000010);
+
+	const __m128i spread = _mm_shuffle_epi8(in, pick);
+
+	// Fields that have to move down:
+	//   masked:   0000bbbb CC000000 AAAAAA00 00000000
+	//   shifted:  00000000 00bbbbCC 00000000 00AAAAAA
+	// Taking the high half of the 16-bit product acts as a right shift.
+	const __m128i down = _mm_mulhi_epu16(
+		_mm_and_si128(spread, mask_hi), scale_hi);
+
+	// Fields that have to move up:
+	//   masked:   00000000 00cccccc 000000aa BBBB0000
+	//   shifted:  00cccccc 00000000 00aaBBBB 00000000
+	// Taking the low half of the 16-bit product acts as a left shift.
+	const __m128i up = _mm_mullo_epi16(
+		_mm_and_si128(spread, mask_lo), scale_lo);
+
+	// Merge both halves:
+	//   result:   00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
+	return _mm_or_si128(down, up);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_translate.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/arch/ssse3/enc_translate.c
@@ -0,0 +1,33 @@
+// Map sixteen 6-bit values (0..63, one per byte) onto the Base64 alphabet.
+//
+// The alphabet consists of five ranges, each with its own constant offset:
+//
+//   #  Value     ASCII      Offset  Slot     Characters
+//   0  [0..25]   [65..90]   +65     0        A..Z
+//   1  [26..51]  [97..122]  +71     1        a..z
+//   2  [52..61]  [48..57]    -4     [2..11]  0..9
+//   3  [62]      [43]       -19     12       +
+//   4  [63]      [47]       -16     13       /
+static BASE64_FORCE_INLINE __m128i
+enc_translate (const __m128i in)
+{
+	// Offsets, laid out by slot number:
+	const __m128i offsets = _mm_setr_epi8(
+		65, 71,
+		-4, -4, -4, -4, -4, -4, -4, -4, -4, -4,
+		-19, -16,
+		0, 0);
+
+	// 0xFF (-1) in every byte above 25, i.e. for ranges #1..#4:
+	const __m128i above_25 = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));
+
+	// The saturating subtraction gives 0 for ranges #0 and #1 and 1..12 for
+	// the rest: correct for range #0, one too low for all others.
+	const __m128i reduced = _mm_subs_epu8(in, _mm_set1_epi8(51));
+
+	// Subtracting -1 bumps the slot for ranges #1..#4 to its final value:
+	const __m128i slot = _mm_sub_epi8(reduced, above_25);
+
+	// Look up each byte's offset and apply it:
+	return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, slot));
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/codec_choose.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/codec_choose.c
@@ -0,0 +1,314 @@
+#include <stdbool.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "libbase64.h"
+#include "codecs.h"
+#include "config.h"
+#include "env.h"
+
+#if (__x86_64__ || __i386__ || _M_X86 || _M_X64)
+  #define BASE64_X86
+  #if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
+    #define BASE64_X86_SIMD
+  #endif
+#endif
+
+#ifdef BASE64_X86
+#ifdef _MSC_VER
+	#include <intrin.h>
+	// The non-MSVC branch gets __cpuid_count and __cpuid from <cpuid.h>.
+	// For MSVC, provide macros with the same argument lists on top of the
+	// __cpuidex intrinsic, so the detection code below is compiler-neutral.
+	// The four register arguments are assigned to and must be lvalues.
+	#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
+	{						\
+		int info[4];				\
+		__cpuidex(info, __level, __count);	\
+		__eax = info[0];			\
+		__ebx = info[1];			\
+		__ecx = info[2];			\
+		__edx = info[3];			\
+	}
+	// Same query with the sub-leaf fixed at 0:
+	#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
+		__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
+#else
+	#include <cpuid.h>
+	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
+		#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
+			// Execute XGETBV with `index` in ECX and return the
+			// result as a 64-bit value, EDX being the upper and
+			// EAX the lower half. Only defined for GCC/Clang
+			// builds that have an AVX-family codec enabled.
+			static inline uint64_t _xgetbv (uint32_t index)
+			{
+				uint32_t eax, edx;
+				__asm__ __volatile__("xgetbv" : "=a"(eax), "=d"(edx) : "c"(index));
+				return ((uint64_t)edx << 32) | eax;
+			}
+		#else
+			#error "Platform not supported"
+		#endif
+	#endif
+#endif
+
+#ifndef bit_AVX512vl
+// Bit 31 of EBX as returned by CPUID leaf 7, sub-leaf 0. The constant is
+// built from an unsigned 1: shifting a signed 1 into the sign bit is
+// undefined behavior and would produce a negative value.
+#define bit_AVX512vl (1U << 31)
+#endif
+#ifndef bit_AVX512vbmi
+#define bit_AVX512vbmi (1 << 1)
+#endif
+#ifndef bit_AVX2
+#define bit_AVX2 (1 << 5)
+#endif
+#ifndef bit_SSSE3
+#define bit_SSSE3 (1 << 9)
+#endif
+#ifndef bit_SSE41
+#define bit_SSE41 (1 << 19)
+#endif
+#ifndef bit_SSE42
+#define bit_SSE42 (1 << 20)
+#endif
+#ifndef bit_AVX
+#define bit_AVX (1 << 28)
+#endif
+
+#define bit_XSAVE_XRSTORE (1 << 27)
+
+#ifndef _XCR_XFEATURE_ENABLED_MASK
+#define _XCR_XFEATURE_ENABLED_MASK 0
+#endif
+
+#define bit_XMM      (1 << 1)
+#define bit_YMM      (1 << 2)
+#define bit_OPMASK   (1 << 5)
+#define bit_ZMM      (1 << 6)
+#define bit_HIGH_ZMM (1 << 7)
+
+#define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS (bit_XMM | bit_YMM)
+
+#define _AVX_512_ENABLED_BY_OS (bit_XMM | bit_YMM | bit_OPMASK | bit_ZMM | bit_HIGH_ZMM)
+
+#endif
+
+// Function declarations:
+#define BASE64_CODEC_FUNCS(arch)					\
+	extern void base64_stream_encode_ ## arch BASE64_ENC_PARAMS;	\
+	extern int  base64_stream_decode_ ## arch BASE64_DEC_PARAMS;
+
+BASE64_CODEC_FUNCS(avx512)
+BASE64_CODEC_FUNCS(avx2)
+BASE64_CODEC_FUNCS(neon32)
+BASE64_CODEC_FUNCS(neon64)
+BASE64_CODEC_FUNCS(plain)
+BASE64_CODEC_FUNCS(ssse3)
+BASE64_CODEC_FUNCS(sse41)
+BASE64_CODEC_FUNCS(sse42)
+BASE64_CODEC_FUNCS(avx)
+
+// Honor an explicit codec request made through the low 16 bits of `flags`.
+//
+// The candidates are probed in a fixed priority order, so when several
+// BASE64_FORCE_* bits are set, the first match in the table wins. A forced
+// codec is always accepted, even if that codec is a no-op; this exists for
+// testing purposes.
+//
+// Returns true when `codec` was filled in. Returns false, leaving `codec`
+// untouched, when no known force flag is present.
+static bool
+codec_choose_forced (struct codec *codec, int flags)
+{
+	// Force flag -> encoder/decoder pair, in order of precedence:
+	static const struct {
+		int           flag;
+		base64_enc_fn enc;
+		base64_dec_fn dec;
+	} forced[] = {
+		{ BASE64_FORCE_AVX2,   base64_stream_encode_avx2,   base64_stream_decode_avx2   },
+		{ BASE64_FORCE_NEON32, base64_stream_encode_neon32, base64_stream_decode_neon32 },
+		{ BASE64_FORCE_NEON64, base64_stream_encode_neon64, base64_stream_decode_neon64 },
+		{ BASE64_FORCE_PLAIN,  base64_stream_encode_plain,  base64_stream_decode_plain  },
+		{ BASE64_FORCE_SSSE3,  base64_stream_encode_ssse3,  base64_stream_decode_ssse3  },
+		{ BASE64_FORCE_SSE41,  base64_stream_encode_sse41,  base64_stream_decode_sse41  },
+		{ BASE64_FORCE_SSE42,  base64_stream_encode_sse42,  base64_stream_decode_sse42  },
+		{ BASE64_FORCE_AVX,    base64_stream_encode_avx,    base64_stream_decode_avx    },
+		{ BASE64_FORCE_AVX512, base64_stream_encode_avx512, base64_stream_decode_avx512 },
+	};
+
+	// Quick exit when none of the force bits is set:
+	if ((flags & 0xFFFF) == 0) {
+		return false;
+	}
+
+	for (size_t i = 0; i < sizeof(forced) / sizeof(forced[0]); i++) {
+		if (flags & forced[i].flag) {
+			codec->enc = forced[i].enc;
+			codec->dec = forced[i].dec;
+			return true;
+		}
+	}
+
+	return false;
+}
+
+// Select a NEON codec on ARM builds.
+//
+// There is no portable way to check for NEON support at runtime from
+// userland in the way x86 offers cpuid, so the decision rests entirely on
+// the compile-time configuration. NEON64 takes precedence; NEON32 is used
+// only when the compiler also advertises NEON.
+//
+// Returns true when `codec` was filled in, false otherwise.
+static bool
+codec_choose_arm (struct codec *codec)
+{
+#if HAVE_NEON64
+
+	codec->enc = base64_stream_encode_neon64;
+	codec->dec = base64_stream_decode_neon64;
+	return true;
+
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
+
+	codec->enc = base64_stream_encode_neon32;
+	codec->dec = base64_stream_decode_neon32;
+	return true;
+
+#else
+
+	(void) codec;
+	return false;
+
+#endif
+}
+
+// Pick the best x86 SIMD codec that is both compiled in and reported as
+// supported by CPUID, trying AVX512, AVX2, AVX, SSE4.2, SSE4.1 and SSSE3 in
+// that order.
+//
+// Returns true when `codec` was filled in. Returns false, leaving `codec`
+// untouched, when no x86 SIMD codec is compiled in (BASE64_X86_SIMD is
+// undefined) or when none of the compiled-in codecs is supported.
+static bool
+codec_choose_x86 (struct codec *codec)
+{
+#ifdef BASE64_X86_SIMD
+
+	unsigned int eax, ebx = 0, ecx = 0, edx;
+	unsigned int max_level;
+
+	// Highest basic CPUID leaf; used to guard the leaf 1 and leaf 7
+	// queries below:
+	#ifdef _MSC_VER
+	int info[4];
+	__cpuidex(info, 0, 0);
+	max_level = info[0];
+	#else
+	max_level = __get_cpuid_max(0, NULL);
+	#endif
+
+	#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
+	// Check for AVX/AVX2/AVX512 support:
+	// Checking for AVX requires 3 things:
+	// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
+	//    (allowing saving YMM registers on context switch)
+	// 2) CPUID indicates support for AVX
+	// 3) XGETBV indicates the AVX registers will be saved and restored on
+	//    context switch
+	//
+	// Note that XGETBV is only available on 686 or later CPUs, so the
+	// instruction needs to be conditionally run.
+	if (max_level >= 1) {
+		__cpuid_count(1, 0, eax, ebx, ecx, edx);
+		if (ecx & bit_XSAVE_XRSTORE) {
+			uint64_t xcr_mask;
+			xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
+			if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
+				#if HAVE_AVX512
+				// AVX512 additionally needs the opmask and ZMM
+				// state bits enabled in the XCR mask:
+				if (max_level >= 7 && ((xcr_mask & _AVX_512_ENABLED_BY_OS) == _AVX_512_ENABLED_BY_OS)) {
+					__cpuid_count(7, 0, eax, ebx, ecx, edx);
+					if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
+						codec->enc = base64_stream_encode_avx512;
+						codec->dec = base64_stream_decode_avx512;
+						return true;
+					}
+				}
+				#endif
+				#if HAVE_AVX2
+				if (max_level >= 7) {
+					__cpuid_count(7, 0, eax, ebx, ecx, edx);
+					if (ebx & bit_AVX2) {
+						codec->enc = base64_stream_encode_avx2;
+						codec->dec = base64_stream_decode_avx2;
+						return true;
+					}
+				}
+				#endif
+				#if HAVE_AVX
+				// The leaf 7 queries above may have overwritten
+				// ecx, so leaf 1 is queried again before the AVX
+				// bit is tested:
+				__cpuid_count(1, 0, eax, ebx, ecx, edx);
+				if (ecx & bit_AVX) {
+					codec->enc = base64_stream_encode_avx;
+					codec->dec = base64_stream_decode_avx;
+					return true;
+				}
+				#endif
+			}
+		}
+	}
+	#endif
+
+	// Each SSE check below queries leaf 1 afresh rather than relying on
+	// register values left behind by an earlier section.
+
+	#if HAVE_SSE42
+	// Check for SSE42 support:
+	if (max_level >= 1) {
+		__cpuid(1, eax, ebx, ecx, edx);
+		if (ecx & bit_SSE42) {
+			codec->enc = base64_stream_encode_sse42;
+			codec->dec = base64_stream_decode_sse42;
+			return true;
+		}
+	}
+	#endif
+
+	#if HAVE_SSE41
+	// Check for SSE41 support:
+	if (max_level >= 1) {
+		__cpuid(1, eax, ebx, ecx, edx);
+		if (ecx & bit_SSE41) {
+			codec->enc = base64_stream_encode_sse41;
+			codec->dec = base64_stream_decode_sse41;
+			return true;
+		}
+	}
+	#endif
+
+	#if HAVE_SSSE3
+	// Check for SSSE3 support:
+	if (max_level >= 1) {
+		__cpuid(1, eax, ebx, ecx, edx);
+		if (ecx & bit_SSSE3) {
+			codec->enc = base64_stream_encode_ssse3;
+			codec->dec = base64_stream_decode_ssse3;
+			return true;
+		}
+	}
+	#endif
+
+#else
+	(void)codec;
+#endif
+
+	return false;
+}
+
+// Fill in `codec` with the encoder/decoder pair to use.
+//
+// Precedence: a codec forced through `flags`, then the ARM selection, then
+// x86 runtime detection. If none of them picks a codec, the plain
+// implementation is installed.
+void
+codec_choose (struct codec *codec, int flags)
+{
+	// Short-circuit evaluation stops at the first selector that succeeds,
+	// so later selectors cannot override an earlier choice:
+	const bool chosen = codec_choose_forced(codec, flags)
+	                 || codec_choose_arm(codec)
+	                 || codec_choose_x86(codec);
+
+	// Nothing matched: fall back on the plain codec.
+	if (!chosen) {
+		codec->enc = base64_stream_encode_plain;
+		codec->dec = base64_stream_decode_plain;
+	}
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/codecs.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/codecs.h
@@ -0,0 +1,57 @@
+#include "libbase64.h"
+
+// Function parameters for encoding functions:
+#define BASE64_ENC_PARAMS			\
+	( struct base64_state	*state		\
+	, const char		*src		\
+	, size_t		 srclen		\
+	, char			*out		\
+	, size_t		*outlen		\
+	)
+
+// Function parameters for decoding functions:
+#define BASE64_DEC_PARAMS			\
+	( struct base64_state	*state		\
+	, const char		*src		\
+	, size_t		 srclen		\
+	, char			*out		\
+	, size_t		*outlen		\
+	)
+
+// Placeholder encoder, used as a stub when a certain encoder is not compiled
+// in. It reads none of its inputs, writes nothing to `out`, and reports zero
+// output bytes through `*outlen`.
+static inline void
+base64_enc_stub BASE64_ENC_PARAMS
+{
+	// `outlen` is the only parameter that is actually used.
+	*outlen = 0;
+
+	// Mark the remaining parameters as deliberately unused.
+	(void) out;
+	(void) srclen;
+	(void) src;
+	(void) state;
+}
+
+// Placeholder decoder, used as a stub when a certain decoder is not compiled
+// in. It touches none of its arguments (`*outlen` is left unmodified) and
+// always returns -1 as the invalid decoding result.
+static inline int
+base64_dec_stub BASE64_DEC_PARAMS
+{
+	// Mark every parameter as deliberately unused.
+	(void) outlen;
+	(void) out;
+	(void) srclen;
+	(void) src;
+	(void) state;
+
+	return -1;
+}
+
+typedef void (* base64_enc_fn) BASE64_ENC_PARAMS;
+typedef int  (* base64_dec_fn) BASE64_DEC_PARAMS;
+
+struct codec	// encoder/decoder pair; both members are assigned by codec_choose()
+{
+	base64_enc_fn enc;	// encoding entry point (signature: BASE64_ENC_PARAMS)
+	base64_dec_fn dec;	// decoding entry point (signature: BASE64_DEC_PARAMS)
+};
+
+extern void codec_choose (struct codec *, int flags);
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/config.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/config.h
@@ -0,0 +1,24 @@
+#ifndef BASE64_CONFIG_H
+#define BASE64_CONFIG_H
+
+#if !defined(__APPLE__) && ((defined(__x86_64__) && defined(__LP64__)) || defined(_M_X64))
+  #define HAVE_SSSE3 1
+  #define HAVE_SSE41 1
+  #define HAVE_SSE42 1
+  #define HAVE_AVX 1
+  #define HAVE_AVX2 1
+  #define HAVE_AVX512 0
+#endif
+
+#define BASE64_WITH_NEON32 0
+#define HAVE_NEON32 BASE64_WITH_NEON32
+
+#if defined(__APPLE__) && defined(__aarch64__)
+#define BASE64_WITH_NEON64 1
+#else
+#define BASE64_WITH_NEON64 0
+#endif
+
+#define HAVE_NEON64 BASE64_WITH_NEON64
+
+#endif // BASE64_CONFIG_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/env.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/env.h
@@ -0,0 +1,84 @@
+#ifndef BASE64_ENV_H
+#define BASE64_ENV_H
+
+#include <stdint.h>
+
+// This header file contains macro definitions that describe certain aspects of
+// the compile-time environment. Compatibility and portability macros go here.
+
+// Define machine endianness. This is for GCC:
+#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#  define BASE64_LITTLE_ENDIAN 1
+#else
+#  define BASE64_LITTLE_ENDIAN 0
+#endif
+
+// This is for Clang:
+#ifdef __LITTLE_ENDIAN__
+#  define BASE64_LITTLE_ENDIAN 1
+#endif
+
+#ifdef __BIG_ENDIAN__
+#  define BASE64_LITTLE_ENDIAN 0
+#endif
+
+// MSVC++ needs intrin.h for _byteswap_uint64 (issue #68):
+#if BASE64_LITTLE_ENDIAN && defined(_MSC_VER)
+#  include <intrin.h>
+#endif
+
+// Endian conversion functions:
+#if BASE64_LITTLE_ENDIAN
+#  ifdef _MSC_VER
+//   Microsoft Visual C++:
+#    define BASE64_HTOBE32(x)	_byteswap_ulong(x)
+#    define BASE64_HTOBE64(x)	_byteswap_uint64(x)
+#  else
+//   GCC and Clang:
+#    define BASE64_HTOBE32(x)	__builtin_bswap32(x)
+#    define BASE64_HTOBE64(x)	__builtin_bswap64(x)
+#  endif
+#else
+// No conversion needed:
+#  define BASE64_HTOBE32(x)	(x)
+#  define BASE64_HTOBE64(x)	(x)
+#endif
+
+// Detect word size:
+#if defined (__x86_64__)
+// This also works for the x32 ABI, which has a 64-bit word size.
+#  define BASE64_WORDSIZE 64
+#elif SIZE_MAX == UINT32_MAX
+#  define BASE64_WORDSIZE 32
+#elif SIZE_MAX == UINT64_MAX
+#  define BASE64_WORDSIZE 64
+#else
+#  error BASE64_WORDSIZE_NOT_DEFINED
+#endif
+
+// End-of-file definitions.
+// Almost end-of-file when waiting for the last '=' character:
+#define BASE64_AEOF 1
+// End-of-file when stream end has been reached or invalid input provided:
+#define BASE64_EOF 2
+
+// GCC 7 defaults to issuing a warning for fallthrough in switch statements,
+// unless the fallthrough cases are marked with an attribute. As we use
+// fallthrough deliberately, define an alias for the attribute:
+#if __GNUC__ >= 7
+#  define BASE64_FALLTHROUGH  __attribute__((fallthrough));
+#else
+#  define BASE64_FALLTHROUGH
+#endif
+
+// Declare macros to ensure that functions that are intended to be inlined, are
+// actually inlined, even when no optimization is applied. A lot of inner loop
+// code is factored into separate functions for reasons of readability, but
+// that code should always be inlined (and optimized) in the main loop.
+#ifdef _MSC_VER
+#  define BASE64_FORCE_INLINE	__forceinline
+#else
+#  define BASE64_FORCE_INLINE  inline __attribute__((always_inline))
+#endif
+
+#endif	// BASE64_ENV_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/lib.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/lib.c
@@ -0,0 +1,164 @@
+#include <stdint.h>
+#include <stddef.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libbase64.h"
+#include "tables/tables.h"
+#include "codecs.h"
+#include "env.h"
+
+// Codec selection shared by every stream in this translation unit. Both
+// pointers start out NULL; the stream init functions call codec_choose() to
+// fill them in while they are unset, and again when the codec bits of the
+// `flags` they receive are nonzero.
+static struct codec codec = { .enc = NULL, .dec = NULL };
+
+// Prepare `state` for a new encoding stream.
+//
+// state: stream state to reset; eof, bytes and carry are zeroed.
+// flags: stored in state->flags. If any of its low 16 bits (the range of the
+//        BASE64_FORCE_* values) is set, the file-level `codec` is chosen
+//        again via codec_choose().
+void
+base64_stream_encode_init (struct base64_state *state, int flags)
+{
+	// Choose a codec on first use, and redo the choice if any codec flag is
+	// set. The mask has to cover BASE64_FORCE_AVX512 (1 << 8): the previous
+	// 0xFF ignored that bit, so use 0xFFFF like base64_stream_decode_init():
+	if (codec.enc == NULL || (flags & 0xFFFF)) {
+		codec_choose(&codec, flags);
+	}
+	state->eof = 0;
+	state->bytes = 0;
+	state->carry = 0;
+	state->flags = flags;
+}
+
+// Encode `srclen` bytes from `src` by forwarding all arguments, unchanged, to
+// the encoder currently stored in the file-level `codec`. No NULL check is
+// made on that pointer here.
+void
+base64_stream_encode
+	( struct base64_state	*state
+	, const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	)
+{
+	const base64_enc_fn encode = codec.enc;
+
+	encode(state, src, srclen, out, outlen);
+}
+
+// Write the end-of-stream trailer for an encoding stream to `out` and store
+// the number of bytes written in `*outlen`:
+//
+//   state->bytes == 1: table character for state->carry, then "==" (3 bytes)
+//   state->bytes == 2: table character for state->carry, then "="  (2 bytes)
+//   anything else:     nothing is written                           (0 bytes)
+//
+// `state` itself is not modified.
+void
+base64_stream_encode_final
+	( struct base64_state	*state
+	, char			*out
+	, size_t		*outlen
+	)
+{
+	uint8_t *const tail = (uint8_t *) out;
+
+	switch (state->bytes) {
+	case 1:
+		tail[0] = base64_table_enc_6bit[state->carry];
+		tail[1] = '=';
+		tail[2] = '=';
+		*outlen = 3;
+		break;
+
+	case 2:
+		tail[0] = base64_table_enc_6bit[state->carry];
+		tail[1] = '=';
+		*outlen = 2;
+		break;
+
+	default:
+		*outlen = 0;
+		break;
+	}
+}
+
+// Prepare `state` for a new decoding stream: make sure the file-level `codec`
+// has a decoder, then zero eof, bytes and carry and store `flags` in the
+// state.
+void
+base64_stream_decode_init (struct base64_state *state, int flags)
+{
+	// Only the low 16 bits of `flags` can request a new codec choice.
+	const int codec_flags = flags & 0xFFFF;
+
+	// Choose on first use, or again whenever a codec flag is set:
+	if (codec.dec == NULL || codec_flags != 0) {
+		codec_choose(&codec, flags);
+	}
+
+	state->flags = flags;
+	state->carry = 0;
+	state->bytes = 0;
+	state->eof = 0;
+}
+
+// Decode `srclen` bytes from `src` by forwarding all arguments, unchanged, to
+// the decoder currently stored in the file-level `codec`, and pass its return
+// value straight through. No NULL check is made on that pointer here.
+int
+base64_stream_decode
+	( struct base64_state	*state
+	, const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	)
+{
+	const base64_dec_fn decode = codec.dec;
+
+	return decode(state, src, srclen, out, outlen);
+}
+
+#ifdef _OPENMP
+
+	// Due to the overhead of initializing OpenMP and creating a team of
+	// threads, we require the data length to be larger than a threshold:
+	#define OMP_THRESHOLD 20000
+
+	// Conditionally include OpenMP-accelerated codec implementations:
+	#include "lib_openmp.c"
+#endif
+
+// One-shot encoder: runs init, stream encode and finalization on a local
+// state and stores the combined output length in `*outlen`. When built with
+// OpenMP, inputs of at least OMP_THRESHOLD bytes are handed to
+// base64_encode_openmp() instead.
+void
+base64_encode
+	( const char	*src
+	, size_t	 srclen
+	, char		*out
+	, size_t	*outlen
+	, int		 flags
+	)
+{
+	struct base64_state state;
+	size_t body_len;	// bytes produced by the stream encoder
+	size_t tail_len;	// bytes produced by the finalizer
+
+	#ifdef _OPENMP
+	if (srclen >= OMP_THRESHOLD) {
+		base64_encode_openmp(src, srclen, out, outlen, flags);
+		return;
+	}
+	#endif
+
+	// Reset the state, then push the complete input through the stream
+	// encoder in a single call:
+	base64_stream_encode_init(&state, flags);
+	base64_stream_encode(&state, src, srclen, out, &body_len);
+
+	// The trailer, if any, goes directly behind the stream output:
+	base64_stream_encode_final(&state, out + body_len, &tail_len);
+
+	*outlen = body_len + tail_len;
+}
+
+// One-shot decoder: runs init and stream decode on a local state. Returns the
+// stream decoder's result when that result is nonzero and state.bytes is 0
+// afterwards; returns 0 in every other case. When built with OpenMP, inputs
+// of at least OMP_THRESHOLD bytes are handed to base64_decode_openmp()
+// instead.
+int
+base64_decode
+	( const char	*src
+	, size_t	 srclen
+	, char		*out
+	, size_t	*outlen
+	, int		 flags
+	)
+{
+	struct base64_state state;
+	int status;
+
+	#ifdef _OPENMP
+	if (srclen >= OMP_THRESHOLD) {
+		return base64_decode_openmp(src, srclen, out, outlen, flags);
+	}
+	#endif
+
+	// Reset the state, then push the complete input through the stream
+	// decoder in a single call:
+	base64_stream_decode_init(&state, flags);
+	status = base64_stream_decode(&state, src, srclen, out, outlen);
+
+	// A decoder that is still waiting for input after the whole buffer
+	// (state.bytes != 0) counts as a failure, as does a zero stream result:
+	return (status != 0 && state.bytes == 0) ? status : 0;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/libbase64.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/libbase64.h
@@ -0,0 +1,146 @@
+#ifndef LIBBASE64_H
+#define LIBBASE64_H
+
+#include <stddef.h>	/* size_t */
+
+
+#if defined(_WIN32) || defined(__CYGWIN__)
+#define BASE64_SYMBOL_IMPORT __declspec(dllimport)
+#define BASE64_SYMBOL_EXPORT __declspec(dllexport)
+#define BASE64_SYMBOL_PRIVATE
+
+#elif __GNUC__ >= 4
+#define BASE64_SYMBOL_IMPORT   __attribute__ ((visibility ("default")))
+#define BASE64_SYMBOL_EXPORT   __attribute__ ((visibility ("default")))
+#define BASE64_SYMBOL_PRIVATE  __attribute__ ((visibility ("hidden")))
+
+#else
+#define BASE64_SYMBOL_IMPORT
+#define BASE64_SYMBOL_EXPORT
+#define BASE64_SYMBOL_PRIVATE
+#endif
+
+#if defined(BASE64_STATIC_DEFINE)
+#define BASE64_EXPORT
+#define BASE64_NO_EXPORT
+
+#else
+#if defined(BASE64_EXPORTS) // defined if we are building the shared library
+#define BASE64_EXPORT BASE64_SYMBOL_EXPORT
+
+#else
+#define BASE64_EXPORT BASE64_SYMBOL_IMPORT
+#endif
+
+#define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These are the flags that can be passed in the `flags` argument. The values
+ * below force the use of a given codec, even if that codec is a no-op in the
+ * current build. Used in testing. Set to 0 for the default behavior, which is
+ * runtime feature detection on x86, a compile-time fixed codec on ARM, and
+ * the plain codec on other platforms: */
+#define BASE64_FORCE_AVX2	(1 << 0)
+#define BASE64_FORCE_NEON32	(1 << 1)
+#define BASE64_FORCE_NEON64	(1 << 2)
+#define BASE64_FORCE_PLAIN	(1 << 3)
+#define BASE64_FORCE_SSSE3	(1 << 4)
+#define BASE64_FORCE_SSE41	(1 << 5)
+#define BASE64_FORCE_SSE42	(1 << 6)
+#define BASE64_FORCE_AVX	(1 << 7)
+#define BASE64_FORCE_AVX512	(1 << 8)
+
+struct base64_state {	/* per-stream state; reset by the stream init functions */
+	int eof;	/* zeroed by init; presumably BASE64_AEOF/BASE64_EOF later - TODO confirm */
+	int bytes;	/* zeroed by init; a value of 1 or 2 makes encode_final emit padding */
+	int flags;	/* the `flags` value that was passed to the init function */
+	unsigned char carry;	/* zeroed by init; encoding-table index used by encode_final */
+};
+
+/* Wrapper function to encode a plain string of given length. Output is written
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
+ * The buffer in `out` has been allocated by the caller and is at least 4/3 the
+ * size of the input. See above for `flags`; set to 0 for default operation: */
+void BASE64_EXPORT base64_encode
+	( const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	, int			 flags
+	) ;
+
+/* Call this before calling base64_stream_encode() to init the state. See above
+ * for `flags`; set to 0 for default operation: */
+void BASE64_EXPORT base64_stream_encode_init
+	( struct base64_state	*state
+	, int			 flags
+	) ;
+
+/* Encodes the block of data of given length at `src`, into the buffer at
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
+ * must be at least 4/3 the size of the in-buffer, but take some margin. Places
+ * the number of new bytes written into `outlen` (which is set to zero when the
+ * function starts). Does not zero-terminate or finalize the output. */
+void BASE64_EXPORT base64_stream_encode
+	( struct base64_state	*state
+	, const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	) ;
+
+/* Finalizes the output begun by previous calls to `base64_stream_encode()`.
+ * Adds the required end-of-stream markers if appropriate. `outlen` is modified
+ * and will contain the number of new bytes written at `out` (which will quite
+ * often be zero). */
+void BASE64_EXPORT base64_stream_encode_final
+	( struct base64_state	*state
+	, char			*out
+	, size_t		*outlen
+	) ;
+
+/* Wrapper function to decode a plain string of given length. Output is written
+ * to *out without trailing zero. Output length in bytes is written to *outlen.
+ * The buffer in `out` has been allocated by the caller and is at least 3/4 the
+ * size of the input. See above for `flags`, set to 0 for default operation: */
+int BASE64_EXPORT base64_decode
+	( const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	, int			 flags
+	) ;
+
+/* Call this before calling base64_stream_decode() to init the state. See above
+ * for `flags`; set to 0 for default operation: */
+void BASE64_EXPORT base64_stream_decode_init
+	( struct base64_state	*state
+	, int			 flags
+	) ;
+
+/* Decodes the block of data of given length at `src`, into the buffer at
+ * `out`. Caller is responsible for allocating a large enough out-buffer; it
+ * must be at least 3/4 the size of the in-buffer, but take some margin. Places
+ * the number of new bytes written into `outlen` (which is set to zero when the
+ * function starts). Does not zero-terminate the output. Returns 1 if all is
+ * well, and 0 if a decoding error was found, such as an invalid character.
+ * Returns -1 if the chosen codec is not included in the current build. Used by
+ * the test harness to check whether a codec is available for testing. */
+int BASE64_EXPORT base64_stream_decode
+	( struct base64_state	*state
+	, const char		*src
+	, size_t		 srclen
+	, char			*out
+	, size_t		*outlen
+	) ;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* LIBBASE64_H */
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/table_dec_32bit.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/table_dec_32bit.h
@@ -0,0 +1,393 @@
+#include <stdint.h>
+#define CHAR62 '+'
+#define CHAR63 '/'
+#define CHARPAD '='
+
+
+#if BASE64_LITTLE_ENDIAN
+
+
+/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */
+
+const uint32_t base64_table_dec_32bit_d0[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x000000f8, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000fc,
+0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
+0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
+0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
+0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
+0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
+0x00000064, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
+0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
+0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
+0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
+0x000000c4, 0x000000c8, 0x000000cc, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+const uint32_t base64_table_dec_32bit_d1[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x0000e003, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000f003,
+0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
+0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
+0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
+0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
+0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
+0x00009001, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
+0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
+0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
+0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
+0x00001003, 0x00002003, 0x00003003, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+const uint32_t base64_table_dec_32bit_d2[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00800f00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00c00f00,
+0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
+0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
+0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
+0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
+0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
+0x00400600, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
+0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
+0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
+0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
+0x00400c00, 0x00800c00, 0x00c00c00, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+const uint32_t base64_table_dec_32bit_d3[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x003e0000, 0xffffffff, 0xffffffff, 0xffffffff, 0x003f0000,
+0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
+0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
+0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
+0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
+0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
+0x00190000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
+0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
+0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
+0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
+0x00310000, 0x00320000, 0x00330000, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+#else
+
+
+/* SPECIAL DECODE TABLES FOR BIG ENDIAN (IBM/MOTOROLA/SUN) CPUS */
+
+const uint32_t base64_table_dec_32bit_d0[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xf8000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xfc000000,
+0xd0000000, 0xd4000000, 0xd8000000, 0xdc000000, 0xe0000000, 0xe4000000,
+0xe8000000, 0xec000000, 0xf0000000, 0xf4000000, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x04000000, 0x08000000, 0x0c000000, 0x10000000, 0x14000000, 0x18000000,
+0x1c000000, 0x20000000, 0x24000000, 0x28000000, 0x2c000000, 0x30000000,
+0x34000000, 0x38000000, 0x3c000000, 0x40000000, 0x44000000, 0x48000000,
+0x4c000000, 0x50000000, 0x54000000, 0x58000000, 0x5c000000, 0x60000000,
+0x64000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x68000000, 0x6c000000, 0x70000000, 0x74000000, 0x78000000,
+0x7c000000, 0x80000000, 0x84000000, 0x88000000, 0x8c000000, 0x90000000,
+0x94000000, 0x98000000, 0x9c000000, 0xa0000000, 0xa4000000, 0xa8000000,
+0xac000000, 0xb0000000, 0xb4000000, 0xb8000000, 0xbc000000, 0xc0000000,
+0xc4000000, 0xc8000000, 0xcc000000, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+const uint32_t base64_table_dec_32bit_d1[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x03e00000, 0xffffffff, 0xffffffff, 0xffffffff, 0x03f00000,
+0x03400000, 0x03500000, 0x03600000, 0x03700000, 0x03800000, 0x03900000,
+0x03a00000, 0x03b00000, 0x03c00000, 0x03d00000, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00100000, 0x00200000, 0x00300000, 0x00400000, 0x00500000, 0x00600000,
+0x00700000, 0x00800000, 0x00900000, 0x00a00000, 0x00b00000, 0x00c00000,
+0x00d00000, 0x00e00000, 0x00f00000, 0x01000000, 0x01100000, 0x01200000,
+0x01300000, 0x01400000, 0x01500000, 0x01600000, 0x01700000, 0x01800000,
+0x01900000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x01a00000, 0x01b00000, 0x01c00000, 0x01d00000, 0x01e00000,
+0x01f00000, 0x02000000, 0x02100000, 0x02200000, 0x02300000, 0x02400000,
+0x02500000, 0x02600000, 0x02700000, 0x02800000, 0x02900000, 0x02a00000,
+0x02b00000, 0x02c00000, 0x02d00000, 0x02e00000, 0x02f00000, 0x03000000,
+0x03100000, 0x03200000, 0x03300000, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+// Decoder lookup table indexed by the input byte value (256 entries).
+// Bytes in the base64 alphabet (A-Z, a-z, 0-9, '+', '/') map to their 6-bit
+// value shifted left by 14 bits (e.g. 'B' -> 1 << 14 == 0x00004000).
+// Every other byte, including '=', maps to 0xffffffff.
+const uint32_t base64_table_dec_32bit_d2[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x000f8000, 0xffffffff, 0xffffffff, 0xffffffff, 0x000fc000,
+0x000d0000, 0x000d4000, 0x000d8000, 0x000dc000, 0x000e0000, 0x000e4000,
+0x000e8000, 0x000ec000, 0x000f0000, 0x000f4000, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00004000, 0x00008000, 0x0000c000, 0x00010000, 0x00014000, 0x00018000,
+0x0001c000, 0x00020000, 0x00024000, 0x00028000, 0x0002c000, 0x00030000,
+0x00034000, 0x00038000, 0x0003c000, 0x00040000, 0x00044000, 0x00048000,
+0x0004c000, 0x00050000, 0x00054000, 0x00058000, 0x0005c000, 0x00060000,
+0x00064000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00068000, 0x0006c000, 0x00070000, 0x00074000, 0x00078000,
+0x0007c000, 0x00080000, 0x00084000, 0x00088000, 0x0008c000, 0x00090000,
+0x00094000, 0x00098000, 0x0009c000, 0x000a0000, 0x000a4000, 0x000a8000,
+0x000ac000, 0x000b0000, 0x000b4000, 0x000b8000, 0x000bc000, 0x000c0000,
+0x000c4000, 0x000c8000, 0x000cc000, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+// Decoder lookup table indexed by the input byte value (256 entries).
+// Bytes in the base64 alphabet (A-Z, a-z, 0-9, '+', '/') map to their 6-bit
+// value shifted left by 8 bits (e.g. 'B' -> 1 << 8 == 0x00000100).
+// Every other byte, including '=', maps to 0xffffffff.
+const uint32_t base64_table_dec_32bit_d3[256] = {
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00003e00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00003f00,
+0x00003400, 0x00003500, 0x00003600, 0x00003700, 0x00003800, 0x00003900,
+0x00003a00, 0x00003b00, 0x00003c00, 0x00003d00, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
+0x00000100, 0x00000200, 0x00000300, 0x00000400, 0x00000500, 0x00000600,
+0x00000700, 0x00000800, 0x00000900, 0x00000a00, 0x00000b00, 0x00000c00,
+0x00000d00, 0x00000e00, 0x00000f00, 0x00001000, 0x00001100, 0x00001200,
+0x00001300, 0x00001400, 0x00001500, 0x00001600, 0x00001700, 0x00001800,
+0x00001900, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0x00001a00, 0x00001b00, 0x00001c00, 0x00001d00, 0x00001e00,
+0x00001f00, 0x00002000, 0x00002100, 0x00002200, 0x00002300, 0x00002400,
+0x00002500, 0x00002600, 0x00002700, 0x00002800, 0x00002900, 0x00002a00,
+0x00002b00, 0x00002c00, 0x00002d00, 0x00002e00, 0x00002f00, 0x00003000,
+0x00003100, 0x00003200, 0x00003300, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
+0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
+};
+
+
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/table_enc_12bit.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/table_enc_12bit.h
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/tables.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/tables.c
@@ -0,0 +1,40 @@
+#include "tables.h"
+
+// Encoder lookup table: index is a 6-bit value (0..63), entry is the
+// corresponding character of the base64 alphabet. Because the initializer
+// is a string literal, the array also holds a trailing NUL at index 64.
+const uint8_t
+base64_table_enc_6bit[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+	"abcdefghijklmnopqrstuvwxyz"
+	"0123456789"
+	"+/";
+
+// In the lookup table below, note that the value for '=' (character 61) is
+// 254, not 255. This character is used for in-band signaling of the end of
+// the datastream, and we will use that later. The characters A-Z, a-z, 0-9
+// and + / are mapped to their "decoded" values. The other bytes all map to
+// the value 255, which flags them as "invalid input".
+
+// Decoder lookup table indexed by input byte value (256 entries); the
+// trailing comment on each row gives the index range that row covers.
+const uint8_t
+base64_table_dec_8bit[] =
+{
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		//   0..15
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		//  16..31
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,  62, 255, 255, 255,  63,		//  32..47
+	 52,  53,  54,  55,  56,  57,  58,  59,  60,  61, 255, 255, 255, 254, 255, 255,		//  48..63
+	255,   0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,		//  64..79
+	 15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25, 255, 255, 255, 255, 255,		//  80..95
+	255,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,		//  96..111
+	 41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51, 255, 255, 255, 255, 255,		// 112..127
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 128..143
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 144..159
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 160..175
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 176..191
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 192..207
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 208..223
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 224..239
+	255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,		// 240..255
+};
+
+#if BASE64_WORDSIZE >= 32
+#  include "table_dec_32bit.h"
+#  include "table_enc_12bit.h"
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/tables.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/base64/tables/tables.h
@@ -0,0 +1,23 @@
+#ifndef BASE64_TABLES_H
+#define BASE64_TABLES_H
+
+#include <stdint.h>
+
+#include "../env.h"
+
+// These tables are used by all codecs for fallback plain encoding/decoding:
+extern const uint8_t base64_table_enc_6bit[];
+extern const uint8_t base64_table_dec_8bit[];
+
+// These tables are used for the 32-bit and 64-bit generic decoders:
+#if BASE64_WORDSIZE >= 32
+extern const uint32_t base64_table_dec_32bit_d0[];
+extern const uint32_t base64_table_dec_32bit_d1[];
+extern const uint32_t base64_table_dec_32bit_d2[];
+extern const uint32_t base64_table_dec_32bit_d3[];
+
+// This table is used by the 32 and 64-bit generic encoders:
+extern const uint16_t base64_table_enc_12bit[];
+#endif
+
+#endif	// BASE64_TABLES_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/bytes_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/bytes_ops.c
@@ -0,0 +1,164 @@
+// Bytes primitive operations
+//
+// These are registered in mypyc.primitives.bytes_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// Equality test specialised for a pair of exact bytes objects.
+//
+// Result is 1 when equal, 0 when not equal and -1 when the comparison
+// raised. Any operand pair that is not exactly (bytes, bytes) is delegated
+// to PyObject_RichCompareBool with Py_EQ.
+int CPyBytes_Compare(PyObject *left, PyObject *right) {
+    if (!PyBytes_CheckExact(left) || !PyBytes_CheckExact(right)) {
+        return PyObject_RichCompareBool(left, right, Py_EQ);
+    }
+
+    // The same object is trivially equal to itself.
+    if (left == right) {
+        return 1;
+    }
+
+    // Modelled on CPython's internal bytes_compare: reject on a length
+    // mismatch, then on the first byte, and only then compare everything.
+    Py_ssize_t nbytes = Py_SIZE(left);
+    if (nbytes != Py_SIZE(right)) {
+        return 0;
+    }
+    const char *lhs = ((PyBytesObject *)left)->ob_sval;
+    const char *rhs = ((PyBytesObject *)right)->ob_sval;
+    if (lhs[0] != rhs[0]) {
+        return 0;
+    }
+    return memcmp(lhs, rhs, nbytes) == 0;
+}
+
+// Index into a bytes or bytearray object and return the byte as a tagged
+// short int. Negative indexes count from the end.
+//
+// On failure an exception is set and CPY_INT_TAG is returned:
+//   - OverflowError when 'index' is not a short tagged int,
+//   - IndexError when the (adjusted) index is out of range.
+CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index) {
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return CPY_INT_TAG;
+    }
+
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t length = ((PyVarObject *)o)->ob_size;
+    if (pos < 0) {
+        pos += length;
+    }
+    if (pos < 0 || pos >= length) {
+        PyErr_SetString(PyExc_IndexError, "index out of range");
+        return CPY_INT_TAG;
+    }
+
+    // Anything that is not bytes is treated as a bytearray.
+    unsigned char byte;
+    if (PyBytes_Check(o)) {
+        byte = ((PyBytesObject *)o)->ob_sval[pos];
+    } else {
+        byte = ((PyByteArrayObject *)o)->ob_bytes[pos];
+    }
+    // Shift left by one to produce the tagged (short) representation.
+    return byte << 1;
+}
+
+// Concatenate 'a' and 'b'. Returns a new reference, or NULL with an
+// exception set on failure.
+//
+//   - bytes + bytes: the result is built directly with two memcpy calls.
+//   - bytearray on the left: delegated to PyByteArray_Concat.
+//   - anything else: delegated to PyBytes_Concat, which (per the CPython
+//     docs) releases the reference to the old 'a' and stores the result,
+//     or NULL on failure, in its place.
+//     NOTE(review): the first two branches do not release 'a'; confirm the
+//     ownership convention the generated code expects.
+PyObject *CPyBytes_Concat(PyObject *a, PyObject *b) {
+    if (PyBytes_Check(a) && PyBytes_Check(b)) {
+        Py_ssize_t a_len = ((PyVarObject *)a)->ob_size;
+        Py_ssize_t b_len = ((PyVarObject *)b)->ob_size;
+        // Guard the addition: signed overflow is undefined behavior and
+        // would yield a bogus allocation size.
+        if (a_len > PY_SSIZE_T_MAX - b_len) {
+            return PyErr_NoMemory();
+        }
+        PyBytesObject *ret = (PyBytesObject *)PyBytes_FromStringAndSize(NULL, a_len + b_len);
+        if (ret != NULL) {
+            memcpy(ret->ob_sval, ((PyBytesObject *)a)->ob_sval, a_len);
+            memcpy(ret->ob_sval + a_len, ((PyBytesObject *)b)->ob_sval, b_len);
+        }
+        return (PyObject *)ret;
+    } else if (PyByteArray_Check(a)) {
+        return PyByteArray_Concat(a, b);
+    } else {
+        PyBytes_Concat(&a, b);
+        return a;
+    }
+}
+
+// Limit 'a' to the range [b, c]: values below 'b' become 'b', values at or
+// above 'c' become 'c', everything else is returned unchanged.
+static inline Py_ssize_t Clamp(Py_ssize_t a, Py_ssize_t b, Py_ssize_t c) {
+    if (a < b) {
+        return b;
+    }
+    if (a >= c) {
+        return c;
+    }
+    return a;
+}
+
+// Return obj[start:end] as a new reference (NULL with an exception set on
+// failure).
+//
+// Fast path: 'obj' is bytes or bytearray and both bounds are short tagged
+// ints. Negative bounds are taken relative to the length and both bounds
+// are clamped to [0, len]. Everything else goes through the generic
+// CPyObject_GetSlice.
+PyObject *CPyBytes_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
+    if ((PyBytes_Check(obj) || PyByteArray_Check(obj))
+            && CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end)) {
+        Py_ssize_t startn = CPyTagged_ShortAsSsize_t(start);
+        Py_ssize_t endn = CPyTagged_ShortAsSsize_t(end);
+        Py_ssize_t len = ((PyVarObject *)obj)->ob_size;
+        if (startn < 0) {
+            startn += len;
+        }
+        if (endn < 0) {
+            endn += len;
+        }
+        startn = Clamp(startn, 0, len);
+        endn = Clamp(endn, 0, len);
+        Py_ssize_t slice_len = endn - startn;
+        // A slice whose end precedes its start (e.g. b"abc"[2:1]) is empty.
+        // Without this, a negative size reaches the *_FromStringAndSize
+        // constructors, which reject it with SystemError.
+        if (slice_len < 0) {
+            slice_len = 0;
+        }
+        if (PyBytes_Check(obj)) {
+            return PyBytes_FromStringAndSize(PyBytes_AS_STRING(obj) + startn, slice_len);
+        } else {
+            return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING(obj) + startn, slice_len);
+        }
+    }
+    return CPyObject_GetSlice(obj, start, end);
+}
+
+// sep.join(iter) for bytes-like separators.
+//
+// An exact bytes separator uses PyBytes_Join directly; any other separator
+// (most commonly a bytearray) is handled by calling its "join" method.
+// Returns a new reference, or NULL with an exception set.
+PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter) {
+    if (!PyBytes_CheckExact(sep)) {
+        _Py_IDENTIFIER(join);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_join); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodOneArg(sep, method_name, iter);
+    }
+    return PyBytes_Join(sep, iter);
+}
+
+// Build one bytes object from 'len' variadic PyObject * arguments by
+// concatenating their contents in order.
+//
+// Each argument's data is read through PyBytesObject.ob_sval, so the items
+// are assumed to be bytes objects (NOTE(review): confirm callers never pass
+// bytearray here).
+//
+// Returns a new reference, or NULL with an exception set: OverflowError if
+// the total size would exceed PY_SSIZE_T_MAX, or whatever
+// PyBytes_FromStringAndSize raised.
+PyObject *CPyBytes_Build(Py_ssize_t len, ...) {
+    Py_ssize_t i;
+    Py_ssize_t sz = 0;
+
+    // First pass: compute the total size.
+    va_list args;
+    va_start(args, len);
+    for (i = 0; i < len; i++) {
+        PyObject *item = va_arg(args, PyObject *);
+        size_t add_sz = ((PyVarObject *)item)->ob_size;
+        // Using size_t to avoid overflow during arithmetic calculation
+        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
+            // Every va_start must be paired with va_end, including on the
+            // error path.
+            va_end(args);
+            PyErr_SetString(PyExc_OverflowError,
+                            "join() result is too long for a Python bytes");
+            return NULL;
+        }
+        sz += add_sz;
+    }
+    va_end(args);
+
+    // Second pass: allocate once and copy each item into place.
+    PyBytesObject *ret = (PyBytesObject *)PyBytes_FromStringAndSize(NULL, sz);
+    if (ret != NULL) {
+        char *res_data = ret->ob_sval;
+        va_start(args, len);
+        for (i = 0; i < len; i++) {
+            PyObject *item = va_arg(args, PyObject *);
+            Py_ssize_t item_sz = ((PyVarObject *)item)->ob_size;
+            memcpy(res_data, ((PyBytesObject *)item)->ob_sval, item_sz);
+            res_data += item_sz;
+        }
+        va_end(args);
+        assert(res_data == ret->ob_sval + ((PyVarObject *)ret)->ob_size);
+    }
+
+    return (PyObject *)ret;
+}
+
+
+// ord() for a bytes or bytearray object of length exactly one.
+//
+// Returns the single byte as a tagged short int. For any other input a
+// TypeError is set and CPY_INT_TAG is returned.
+CPyTagged CPyBytes_Ord(PyObject *obj) {
+    const char *data = NULL;
+
+    if (PyBytes_Check(obj)) {
+        if (PyBytes_GET_SIZE(obj) == 1) {
+            data = PyBytes_AS_STRING(obj);
+        }
+    } else if (PyByteArray_Check(obj)) {
+        if (PyByteArray_GET_SIZE(obj) == 1) {
+            data = PyByteArray_AS_STRING(obj);
+        }
+    }
+
+    if (data == NULL) {
+        PyErr_SetString(PyExc_TypeError, "ord() expects a character");
+        return CPY_INT_TAG;
+    }
+    return (unsigned char)(data[0]) << 1;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/dict_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/dict_ops.c
@@ -0,0 +1,491 @@
+// Dict primitive operations
+//
+// These are registered in mypyc.primitives.dict_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+#ifndef Py_TPFLAGS_MAPPING
+#define Py_TPFLAGS_MAPPING (1 << 6)
+#endif
+
+// dict[key] lookup that returns a new reference, or NULL with an exception
+// set.
+//
+// Only exact dicts take the direct PyDict_GetItemWithError route, raising
+// KeyError on a miss. Subclasses such as defaultdict may customise lookup,
+// so they go through the generic PyObject_GetItem.
+PyObject *CPyDict_GetItem(PyObject *dict, PyObject *key) {
+    if (!PyDict_CheckExact(dict)) {
+        return PyObject_GetItem(dict, key);
+    }
+
+    PyObject *value = PyDict_GetItemWithError(dict, key);
+    if (value != NULL) {
+        // Borrowed reference from the dict; hand out an owned one.
+        Py_INCREF(value);
+        return value;
+    }
+    // NULL without a pending error means the key is simply absent.
+    if (!PyErr_Occurred()) {
+        PyErr_SetObject(PyExc_KeyError, key);
+    }
+    return NULL;
+}
+
+// Build a new dict from 'size' key/value pairs passed as variadic
+// PyObject * arguments (key0, value0, key1, value1, ...).
+//
+// Returns a new reference, or NULL with an exception set if allocation or
+// any insertion fails.
+PyObject *CPyDict_Build(Py_ssize_t size, ...) {
+    Py_ssize_t i;
+
+    PyObject *res = _PyDict_NewPresized(size);
+    if (res == NULL) {
+        return NULL;
+    }
+
+    va_list args;
+    va_start(args, size);
+
+    for (i = 0; i < size; i++) {
+        PyObject *key = va_arg(args, PyObject *);
+        PyObject *value = va_arg(args, PyObject *);
+        if (PyDict_SetItem(res, key, value)) {
+            // Every va_start must be paired with va_end, including on the
+            // error path.
+            va_end(args);
+            Py_DECREF(res);
+            return NULL;
+        }
+    }
+
+    va_end(args);
+    return res;
+}
+
+// dict.get(key, fallback): returns a new reference to the stored value, or
+// to 'fallback' when the key is absent. Returns NULL if the lookup itself
+// raised.
+//
+// The dict API is used even for subclasses, i.e. this assumes a subclass
+// does not change the behavior of get().
+PyObject *CPyDict_Get(PyObject *dict, PyObject *key, PyObject *fallback) {
+    PyObject *found = PyDict_GetItemWithError(dict, key);
+    if (found == NULL && PyErr_Occurred()) {
+        return NULL;
+    }
+    PyObject *result = (found != NULL) ? found : fallback;
+    Py_INCREF(result);
+    return result;
+}
+
+// dict.get(key) with the implicit default: same as CPyDict_Get with None
+// as the fallback value.
+PyObject *CPyDict_GetWithNone(PyObject *dict, PyObject *key) {
+    PyObject *none_fallback = Py_None;
+    return CPyDict_Get(dict, key, none_fallback);
+}
+
+// dict.setdefault(key, value). Returns a new reference to the value now
+// stored under 'key', or NULL with an exception set.
+//
+// Exact dicts use PyDict_SetDefault; any other object has its "setdefault"
+// method called.
+PyObject *CPyDict_SetDefault(PyObject *dict, PyObject *key, PyObject *value) {
+    if (!PyDict_CheckExact(dict)) {
+        _Py_IDENTIFIER(setdefault);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_setdefault); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodObjArgs(dict, method_name, key, value, NULL);
+    }
+
+    // PyDict_SetDefault hands back a borrowed reference (or NULL on error).
+    PyObject *stored = PyDict_SetDefault(dict, key, value);
+    Py_XINCREF(stored);
+    return stored;
+}
+
+// dict.setdefault(key) with the implicit default: same as
+// CPyDict_SetDefault with None as the value.
+PyObject *CPyDict_SetDefaultWithNone(PyObject *dict, PyObject *key) {
+    PyObject *none_value = Py_None;
+    return CPyDict_SetDefault(dict, key, none_value);
+}
+
+// dict.setdefault(key, <empty container>) without building the container
+// unless it is needed.
+//
+// 'data_type' selects the container created on a miss:
+//   1 -> list, 2 -> dict, 3 -> set.
+//
+// Returns a new reference to the existing or freshly inserted value, or
+// NULL on failure.
+// NOTE(review): an unrecognised 'data_type' returns NULL without setting an
+// exception (the lookup error has already been cleared at that point).
+PyObject *CPyDict_SetDefaultWithEmptyDatatype(PyObject *dict, PyObject *key,
+                                              int data_type) {
+    PyObject *res = CPyDict_GetItem(dict, key);
+    if (res != NULL) {
+        return res;
+    }
+
+    // CPyDict_GetItem() sets an exception (KeyError for a missing key) when
+    // it fails; discard it before inserting the default.
+    PyErr_Clear();
+
+    PyObject *new_obj;
+    if (data_type == 1) {
+        new_obj = PyList_New(0);
+    } else if (data_type == 2) {
+        new_obj = PyDict_New();
+    } else if (data_type == 3) {
+        new_obj = PySet_New(NULL);
+    } else {
+        return NULL;
+    }
+    // Allocation failed: the constructor has already set the exception.
+    if (new_obj == NULL) {
+        return NULL;
+    }
+
+    if (CPyDict_SetItem(dict, key, new_obj) == -1) {
+        // The insert does not take over our reference, so release it here
+        // to avoid leaking the container.
+        Py_DECREF(new_obj);
+        return NULL;
+    }
+    return new_obj;
+}
+
+// dict[key] = value. Exact dicts use PyDict_SetItem; everything else goes
+// through the generic PyObject_SetItem. The result of whichever call was
+// made is returned unchanged.
+int CPyDict_SetItem(PyObject *dict, PyObject *key, PyObject *value) {
+    return PyDict_CheckExact(dict) ? PyDict_SetItem(dict, key, value)
+                                   : PyObject_SetItem(dict, key, value);
+}
+
+// Turn a call result into a status code: NULL becomes -1; a non-NULL
+// object is released and 0 is returned.
+static inline int CPy_ObjectToStatus(PyObject *obj) {
+    if (obj == NULL) {
+        return -1;
+    }
+    Py_DECREF(obj);
+    return 0;
+}
+
+// Generic dict.update(stuff): calls the object's "update" method.
+// Returns 0 on success and -1 (with an exception set) on failure.
+static int CPyDict_UpdateGeneral(PyObject *dict, PyObject *stuff) {
+    _Py_IDENTIFIER(update);
+    PyObject *method_name = _PyUnicode_FromId(&PyId_update); /* borrowed */
+    if (method_name == NULL) {
+        return -1;
+    }
+    return CPy_ObjectToStatus(PyObject_CallMethodOneArg(dict, method_name, stuff));
+}
+
+// Merge 'stuff' into 'dict' for a dict display ({**stuff}).
+//
+// Same as PyDict_Update, except that an AttributeError from the update is
+// replaced by a TypeError saying the object is not a mapping. Returns the
+// PyDict_Update status.
+int CPyDict_UpdateInDisplay(PyObject *dict, PyObject *stuff) {
+    // from https://github.com/python/cpython/blob/55d035113dfb1bd90495c8571758f504ae8d4802/Python/ceval.c#L2710
+    int status = PyDict_Update(dict, stuff);
+    if (status < 0 && PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        PyErr_Format(PyExc_TypeError,
+                     "'%.200s' object is not a mapping",
+                     Py_TYPE(stuff)->tp_name);
+    }
+    return status;
+}
+
+// dict.update(stuff): PyDict_Update for exact dicts, the "update" method
+// for anything else. The status of whichever call was made is returned.
+int CPyDict_Update(PyObject *dict, PyObject *stuff) {
+    if (!PyDict_CheckExact(dict)) {
+        return CPyDict_UpdateGeneral(dict, stuff);
+    }
+    return PyDict_Update(dict, stuff);
+}
+
+// dict.update(stuff) where 'stuff' may be a mapping or an iterable of
+// key/value pairs.
+//
+// For an exact dict target: a dict, or any object with a "keys" attribute,
+// is merged with PyDict_Update; everything else is merged as a sequence of
+// pairs with PyDict_MergeFromSeq2 (override enabled). Other targets have
+// their "update" method called.
+int CPyDict_UpdateFromAny(PyObject *dict, PyObject *stuff) {
+    if (!PyDict_CheckExact(dict)) {
+        return CPyDict_UpdateGeneral(dict, stuff);
+    }
+
+    _Py_IDENTIFIER(keys);
+    int looks_like_mapping = PyDict_Check(stuff)
+                             || _CPyObject_HasAttrId(stuff, &PyId_keys);
+    if (looks_like_mapping) {
+        return PyDict_Update(dict, stuff);
+    }
+    return PyDict_MergeFromSeq2(dict, stuff, 1);
+}
+
+// dict(obj): build a new dict from a mapping or an iterable of pairs.
+//
+// A dict (or subclass) is copied with PyDict_Copy. Otherwise a fresh dict
+// is filled with PyDict_Update when 'obj' has a "keys" attribute, and with
+// PyDict_MergeFromSeq2 when it does not. Returns a new reference, or NULL
+// with an exception set.
+PyObject *CPyDict_FromAny(PyObject *obj) {
+    if (PyDict_Check(obj)) {
+        return PyDict_Copy(obj);
+    }
+
+    PyObject *result = PyDict_New();
+    if (result == NULL) {
+        return NULL;
+    }
+
+    _Py_IDENTIFIER(keys);
+    int status = _CPyObject_HasAttrId(obj, &PyId_keys)
+                     ? PyDict_Update(result, obj)
+                     : PyDict_MergeFromSeq2(result, obj, 1);
+    if (status < 0) {
+        Py_DECREF(result);
+        return NULL;
+    }
+    return result;
+}
+
+// dict.keys() as a view object. Exact dicts get a keys view built
+// directly; other objects have their "keys" method called. Returns NULL
+// with an exception set on failure.
+PyObject *CPyDict_KeysView(PyObject *dict) {
+    if (!PyDict_CheckExact(dict)) {
+        _Py_IDENTIFIER(keys);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_keys); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodNoArgs(dict, method_name);
+    }
+    return _CPyDictView_New(dict, &PyDictKeys_Type);
+}
+
+// dict.values() as a view object. Exact dicts get a values view built
+// directly; other objects have their "values" method called. Returns NULL
+// with an exception set on failure.
+PyObject *CPyDict_ValuesView(PyObject *dict) {
+    if (!PyDict_CheckExact(dict)) {
+        _Py_IDENTIFIER(values);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_values); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodNoArgs(dict, method_name);
+    }
+    return _CPyDictView_New(dict, &PyDictValues_Type);
+}
+
+// dict.items() as a view object. Exact dicts get an items view built
+// directly; other objects have their "items" method called. Returns NULL
+// with an exception set on failure.
+PyObject *CPyDict_ItemsView(PyObject *dict) {
+    if (!PyDict_CheckExact(dict)) {
+        _Py_IDENTIFIER(items);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_items); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodNoArgs(dict, method_name);
+    }
+    return _CPyDictView_New(dict, &PyDictItems_Type);
+}
+
+// Return the keys of 'dict' as a new list (new reference), or NULL with an
+// exception set.
+//
+// Exact dicts use PyDict_Keys. For other objects the "keys" method is
+// called and its result is copied into a fresh list, so a list is returned
+// in both cases.
+PyObject *CPyDict_Keys(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        return PyDict_Keys(dict);
+    }
+    // Inline generic fallback logic to also return a list.
+    PyObject *list = PyList_New(0);
+    if (list == NULL) {
+        return NULL;
+    }
+    _Py_IDENTIFIER(keys);
+    PyObject *name = _PyUnicode_FromId(&PyId_keys); /* borrowed */
+    if (name == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
+    if (view == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    int res = PyList_Extend(list, view);
+    Py_DECREF(view);
+    if (res < 0) {
+        // Do not leak the partially filled list.
+        Py_DECREF(list);
+        return NULL;
+    }
+    return list;
+}
+
+// Return the values of 'dict' as a new list (new reference), or NULL with
+// an exception set.
+//
+// Exact dicts use PyDict_Values. For other objects the "values" method is
+// called and its result is copied into a fresh list, so a list is returned
+// in both cases.
+PyObject *CPyDict_Values(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        return PyDict_Values(dict);
+    }
+    // Inline generic fallback logic to also return a list.
+    PyObject *list = PyList_New(0);
+    if (list == NULL) {
+        return NULL;
+    }
+    _Py_IDENTIFIER(values);
+    PyObject *name = _PyUnicode_FromId(&PyId_values); /* borrowed */
+    if (name == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
+    if (view == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    int res = PyList_Extend(list, view);
+    Py_DECREF(view);
+    if (res < 0) {
+        // Do not leak the partially filled list.
+        Py_DECREF(list);
+        return NULL;
+    }
+    return list;
+}
+
+// Return the items of 'dict' as a new list (new reference), or NULL with
+// an exception set.
+//
+// Exact dicts use PyDict_Items. For other objects the "items" method is
+// called and its result is copied into a fresh list, so a list is returned
+// in both cases.
+PyObject *CPyDict_Items(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        return PyDict_Items(dict);
+    }
+    // Inline generic fallback logic to also return a list.
+    PyObject *list = PyList_New(0);
+    if (list == NULL) {
+        return NULL;
+    }
+    _Py_IDENTIFIER(items);
+    PyObject *name = _PyUnicode_FromId(&PyId_items); /* borrowed */
+    if (name == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
+    if (view == NULL) {
+        Py_DECREF(list);
+        return NULL;
+    }
+    int res = PyList_Extend(list, view);
+    Py_DECREF(view);
+    if (res < 0) {
+        // Do not leak the partially filled list.
+        Py_DECREF(list);
+        return NULL;
+    }
+    return list;
+}
+
+// dict.clear(). Exact dicts use PyDict_Clear; other objects have their
+// "clear" method called.
+//
+// Returns 1 on success and 0 on failure (with an exception set).
+char CPyDict_Clear(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        PyDict_Clear(dict);
+    } else {
+        _Py_IDENTIFIER(clear);
+        PyObject *name = _PyUnicode_FromId(&PyId_clear); /* borrowed */
+        if (name == NULL) {
+            return 0;
+        }
+        PyObject *res = PyObject_CallMethodNoArgs(dict, name);
+        if (res == NULL) {
+            return 0;
+        }
+        // The call returns a new reference; only success matters here, so
+        // release it instead of leaking it.
+        Py_DECREF(res);
+    }
+    return 1;
+}
+
+// dict.copy(). Exact dicts use PyDict_Copy; other objects have their
+// "copy" method called. Returns a new reference, or NULL with an exception
+// set.
+PyObject *CPyDict_Copy(PyObject *dict) {
+    if (!PyDict_CheckExact(dict)) {
+        _Py_IDENTIFIER(copy);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_copy); /* borrowed */
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodNoArgs(dict, method_name);
+    }
+    return PyDict_Copy(dict);
+}
+
+// Prepare iteration over the keys of 'dict'.
+//
+// For an exact dict the dict itself is returned (as a new reference),
+// which signals that the fast iteration path can be used. Otherwise a
+// regular iterator from PyObject_GetIter is returned.
+PyObject *CPyDict_GetKeysIter(PyObject *dict) {
+    if (!PyDict_CheckExact(dict)) {
+        return PyObject_GetIter(dict);
+    }
+    Py_INCREF(dict);
+    return dict;
+}
+
+// Prepare iteration over the items of 'dict'.
+//
+// For an exact dict the dict itself is returned (as a new reference),
+// which signals that the fast iteration path can be used. Otherwise the
+// "items" method is called and an iterator over its result is returned.
+// Returns NULL with an exception set on failure.
+PyObject *CPyDict_GetItemsIter(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        Py_INCREF(dict);
+        return dict;
+    }
+
+    _Py_IDENTIFIER(items);
+    PyObject *method_name = _PyUnicode_FromId(&PyId_items); /* borrowed */
+    if (method_name == NULL) {
+        return NULL;
+    }
+    PyObject *items_view = PyObject_CallMethodNoArgs(dict, method_name);
+    if (items_view == NULL) {
+        return NULL;
+    }
+    // The view itself is no longer needed once the iterator request is made.
+    PyObject *iterator = PyObject_GetIter(items_view);
+    Py_DECREF(items_view);
+    return iterator;
+}
+
+// Prepare iteration over the values of 'dict'.
+//
+// For an exact dict the dict itself is returned (as a new reference),
+// which signals that the fast iteration path can be used. Otherwise the
+// "values" method is called and an iterator over its result is returned.
+// Returns NULL with an exception set on failure.
+PyObject *CPyDict_GetValuesIter(PyObject *dict) {
+    if (PyDict_CheckExact(dict)) {
+        Py_INCREF(dict);
+        return dict;
+    }
+
+    _Py_IDENTIFIER(values);
+    PyObject *method_name = _PyUnicode_FromId(&PyId_values); /* borrowed */
+    if (method_name == NULL) {
+        return NULL;
+    }
+    PyObject *values_view = PyObject_CallMethodNoArgs(dict, method_name);
+    if (values_view == NULL) {
+        return NULL;
+    }
+    // The view itself is no longer needed once the iterator request is made.
+    PyObject *iterator = PyObject_GetIter(values_view);
+    Py_DECREF(values_view);
+    return iterator;
+}
+
+// Advance 'dict_iter' and record the outcome in 'ret':
+//   - an item was produced: f0 = 1 and f2 = the item (owned reference),
+//   - nothing was produced: f0 = 0 and f2 = None (owned reference).
+// ret->f1 is left untouched.
+static void _CPyDict_FromNext(tuple_T3CIO *ret, PyObject *dict_iter) {
+    PyObject *next_item = PyIter_Next(dict_iter);
+    if (next_item != NULL) {
+        ret->f0 = 1;
+        ret->f2 = next_item;
+        return;
+    }
+    ret->f0 = 0;
+    Py_INCREF(Py_None);
+    ret->f2 = Py_None;
+}
+
+// Fast dictionary iteration helper: one step of key iteration, with all
+// results packed into a single tuple instead of several registers.
+//
+//   f0: whether an item was produced (iteration should continue)
+//   f1: the offset to pass to the next call
+//   f2: the next key, or None when f0 is 0 (always an owned reference)
+//
+// Exact dicts take the PyDict_Next fast path; anything else is treated as
+// an iterator and advanced with the generic iterator logic.
+tuple_T3CIO CPyDict_NextKey(PyObject *dict_or_iter, CPyTagged offset) {
+    tuple_T3CIO ret;
+    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
+    PyObject *dummy;
+
+    if (!PyDict_CheckExact(dict_or_iter)) {
+        // The offset has no meaning for an iterator; pass it through.
+        ret.f1 = offset;
+        _CPyDict_FromNext(&ret, dict_or_iter);
+        return ret;
+    }
+
+    ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &dummy);
+    if (!ret.f0) {
+        // Use None as the key so mypyc can manage refcounts uniformly.
+        ret.f1 = 0;
+        ret.f2 = Py_None;
+    } else {
+        ret.f1 = CPyTagged_FromSsize_t(py_offset);
+    }
+    // PyDict_Next() yields borrowed references; return an owned one.
+    Py_INCREF(ret.f2);
+    return ret;
+}
+
+// One step of value iteration, with the results packed into a tuple:
+//
+//   f0: whether an item was produced (iteration should continue)
+//   f1: the offset to pass to the next call
+//   f2: the next value, or None when f0 is 0 (always an owned reference)
+//
+// Exact dicts take the PyDict_Next fast path; anything else is treated as
+// an iterator and advanced with the generic iterator logic.
+tuple_T3CIO CPyDict_NextValue(PyObject *dict_or_iter, CPyTagged offset) {
+    tuple_T3CIO ret;
+    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
+    PyObject *dummy;
+
+    if (!PyDict_CheckExact(dict_or_iter)) {
+        // The offset has no meaning for an iterator; pass it through.
+        ret.f1 = offset;
+        _CPyDict_FromNext(&ret, dict_or_iter);
+        return ret;
+    }
+
+    ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &dummy, &ret.f2);
+    if (!ret.f0) {
+        // Use None as the value so mypyc can manage refcounts uniformly.
+        ret.f1 = 0;
+        ret.f2 = Py_None;
+    } else {
+        ret.f1 = CPyTagged_FromSsize_t(py_offset);
+    }
+    // PyDict_Next() yields borrowed references; return an owned one.
+    Py_INCREF(ret.f2);
+    return ret;
+}
+
+// One step of (key, value) iteration, with the results packed into a tuple:
+//
+//   f0: whether an item was produced (iteration should continue)
+//   f1: the offset to pass to the next call
+//   f2: the next key, or None when f0 is 0 (owned reference)
+//   f3: the next value, or None when f0 is 0 (owned reference)
+//
+// Exact dicts take the PyDict_Next fast path. Anything else is treated as
+// an iterator that must yield 2-tuples; a different item sets TypeError
+// and stops the iteration (f0 = 0).
+tuple_T4CIOO CPyDict_NextItem(PyObject *dict_or_iter, CPyTagged offset) {
+    tuple_T4CIOO ret;
+    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
+
+    if (PyDict_CheckExact(dict_or_iter)) {
+        ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &ret.f3);
+        if (ret.f0) {
+            ret.f1 = CPyTagged_FromSsize_t(py_offset);
+        } else {
+            // Set key and value to None, so mypyc can manage refcounts.
+            ret.f1 = 0;
+            ret.f2 = Py_None;
+            ret.f3 = Py_None;
+        }
+    } else {
+        ret.f1 = offset;
+        PyObject *item = PyIter_Next(dict_or_iter);
+        if (item != NULL && PyTuple_Check(item) && PyTuple_GET_SIZE(item) == 2) {
+            ret.f0 = 1;
+            ret.f2 = PyTuple_GET_ITEM(item, 0);
+            ret.f3 = PyTuple_GET_ITEM(item, 1);
+            // Take our own references BEFORE releasing the tuple: if the
+            // tuple holds the only references to its elements, dropping it
+            // first would free them while they are still in use.
+            Py_INCREF(ret.f2);
+            Py_INCREF(ret.f3);
+            Py_DECREF(item);
+            return ret;
+        }
+        if (item != NULL) {
+            PyErr_SetString(PyExc_TypeError, "a tuple of length 2 expected");
+            // The rejected item is an owned reference; release it.
+            Py_DECREF(item);
+        }
+        ret.f0 = 0;
+        ret.f2 = Py_None;
+        ret.f3 = Py_None;
+    }
+    // PyDict_Next() returns borrowed references (and None is borrowed
+    // here too), so convert both to owned references.
+    Py_INCREF(ret.f2);
+    Py_INCREF(ret.f3);
+    return ret;
+}
+
+// Test the Py_TPFLAGS_MAPPING bit of the object's type. The result is
+// non-zero (the flag value itself) when the bit is set and 0 otherwise.
+int CPyMapping_Check(PyObject *obj) {
+    PyTypeObject *type = Py_TYPE(obj);
+    return type->tp_flags & Py_TPFLAGS_MAPPING;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/exc_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/exc_ops.c
@@ -0,0 +1,261 @@
+#include "pythoncapi_compat.h"
+
+// Exception related primitive operations
+//
+// These are registered in mypyc.primitives.exc_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+void CPy_Raise(PyObject *exc) {
+    if (PyObject_IsInstance(exc, (PyObject *)&PyType_Type)) {
+        PyObject *obj = PyObject_CallNoArgs(exc);
+        if (!obj)
+            return;
+        PyErr_SetObject(exc, obj);
+        Py_DECREF(obj);
+    } else {
+        PyErr_SetObject((PyObject *)Py_TYPE(exc), exc);
+    }
+}
+
+// Re-raise the exception currently being handled: read the triple reported
+// by PyErr_GetExcInfo() and install it as the error indicator.
+void CPy_Reraise(void) {
+    PyObject *exc_type;
+    PyObject *exc_value;
+    PyObject *exc_tb;
+
+    PyErr_GetExcInfo(&exc_type, &exc_value, &exc_tb);
+    PyErr_Restore(exc_type, exc_value, exc_tb);
+}
+
+void CPyErr_SetObjectAndTraceback(PyObject *type, PyObject *value, PyObject *traceback) {
+    if (!PyType_Check(type) && Py_IsNone(value)) {
+        // The first argument must be an exception instance
+        value = type;
+        type = (PyObject *)Py_TYPE(value);
+    }
+
+    // Set the value and traceback of an error. Because calling
+    // PyErr_Restore takes away a reference to each object passed in
+    // as an argument, we manually increase the reference count of
+    // each argument before calling it.
+    Py_INCREF(type);
+    Py_INCREF(value);
+    Py_INCREF(traceback);
+    PyErr_Restore(type, value, traceback);
+}
+
+// Begin handling the currently propagating exception.
+//
+// Returns the previous sys.exc_info() triple so it can be restored once the
+// handler finishes (see CPy_RestoreExcInfo). Each slot is passed through
+// _CPy_ToDummy, which per the original note turns NULL into the ExcDummy
+// object to simplify refcount handling in generated code. The propagating
+// exception is normalized, stored as the new exc_info and the error
+// indicator is cleared.
+tuple_T3OOO CPy_CatchError(void) {
+    tuple_T3OOO saved;
+    PyErr_GetExcInfo(&saved.f0, &saved.f1, &saved.f2);
+    _CPy_ToDummy(&saved.f0);
+    _CPy_ToDummy(&saved.f1);
+    _CPy_ToDummy(&saved.f2);
+
+    if (PyErr_Occurred() == NULL) {
+        PyErr_SetString(PyExc_RuntimeError, "CPy_CatchError called with no error!");
+    }
+
+    // Pull the pending error out of the indicator and normalize it into the
+    // shape Python code expects.
+    PyObject *exc_type;
+    PyObject *exc_value;
+    PyObject *exc_tb;
+    PyErr_Fetch(&exc_type, &exc_value, &exc_tb);
+    // Could we avoid always normalizing?
+    PyErr_NormalizeException(&exc_type, &exc_value, &exc_tb);
+    if (exc_tb != NULL) {
+        PyException_SetTraceback(exc_value, exc_tb);
+    }
+
+    // Mark the exception as "being handled" by stashing it in
+    // sys.exc_info(); mypyc routines read it back from there.
+    PyErr_SetExcInfo(exc_type, exc_value, exc_tb);
+    // The exception is no longer propagating.
+    PyErr_Clear();
+
+    return saved;
+}
+
+// Reinstall a previously saved exc_info triple. Each field is mapped back
+// through _CPy_FromDummy before being handed to PyErr_SetExcInfo().
+void CPy_RestoreExcInfo(tuple_T3OOO info) {
+    PyObject *exc_type = _CPy_FromDummy(info.f0);
+    PyObject *exc_value = _CPy_FromDummy(info.f1);
+    PyObject *exc_tb = _CPy_FromDummy(info.f2);
+    PyErr_SetExcInfo(exc_type, exc_value, exc_tb);
+}
+
+// Check whether the type of the exception stored in CPy_ExcState()
+// matches `type`, using PyErr_GivenExceptionMatches().
+bool CPy_ExceptionMatches(PyObject *type) {
+    PyObject *current = CPy_ExcState()->exc_value;
+    PyObject *current_type = (PyObject *)Py_TYPE(current);
+    return PyErr_GivenExceptionMatches(current_type, type);
+}
+
+// Return a new reference to the exception value stored in CPy_ExcState().
+PyObject *CPy_GetExcValue(void) {
+    PyObject *value = CPy_ExcState()->exc_value;
+    Py_INCREF(value);
+    return value;
+}
+
+// Replace a NULL slot with a new reference to None; non-NULL slots are
+// left untouched.
+static inline void _CPy_ToNone(PyObject **p) {
+    if (*p != NULL) {
+        return;
+    }
+    Py_INCREF(Py_None);
+    *p = Py_None;
+}
+
+// Fetch the exc_info triple via PyErr_GetExcInfo() into the three output
+// slots, substituting None for any slot that comes back NULL.
+void _CPy_GetExcInfo(PyObject **p_type, PyObject **p_value, PyObject **p_traceback) {
+    PyObject **slots[3] = {p_type, p_value, p_traceback};
+
+    PyErr_GetExcInfo(p_type, p_value, p_traceback);
+    for (int k = 0; k < 3; k++) {
+        _CPy_ToNone(slots[k]);
+    }
+}
+
+// Return the exc_info triple as a struct; slots are filled by
+// _CPy_GetExcInfo(), so NULL entries have been replaced with None.
+tuple_T3OOO CPy_GetExcInfo(void) {
+    tuple_T3OOO info;
+    _CPy_GetExcInfo(&info.f0, &info.f1, &info.f2);
+    return info;
+}
+
+// Report an out-of-memory condition on stderr and abort the process.
+// This function does not return.
+void CPyError_OutOfMemory(void) {
+    fputs("fatal: out of memory\n", stderr);
+    fflush(stderr);
+    abort();
+}
+
+// Build a readable type name from the type's __module__ and __qualname__.
+//
+// Types whose module is "builtins" are rendered as the bare qualified name;
+// everything else as "module.qualname". Returns a new reference, or NULL if
+// either attribute cannot be fetched or is not a str.
+static PyObject *CPy_GetTypeName(PyObject *type) {
+    PyObject *result = NULL;
+    PyObject *qualname = NULL;
+    PyObject *modname = PyObject_GetAttrString(type, "__module__");
+
+    if (modname != NULL && PyUnicode_Check(modname)) {
+        qualname = PyObject_GetAttrString(type, "__qualname__");
+        if (qualname != NULL && PyUnicode_Check(qualname)) {
+            if (PyUnicode_CompareWithASCIIString(modname, "builtins") != 0) {
+                result = PyUnicode_FromFormat("%U.%U", modname, qualname);
+            } else {
+                Py_INCREF(qualname);
+                result = qualname;
+            }
+        }
+    }
+
+    Py_XDECREF(modname);
+    Py_XDECREF(qualname);
+    return result;
+}
+
+// Get the type of a value as a string, expanding tuples to include
+// all the element types.
+//
+// - None is rendered as "None".
+// - Exact tuples are rendered as "tuple[T1, T2, ...]" (recursively), as
+//   "tuple[]" when empty, and as "tuple[<N items>]" when they have more
+//   than 10 items.
+// - Anything else is rendered via CPy_GetTypeName().
+//
+// Returns a new reference, or NULL on failure.
+static PyObject *CPy_FormatTypeName(PyObject *value) {
+    if (Py_IsNone(value)) {
+        return PyUnicode_FromString("None");
+    }
+
+    if (!PyTuple_CheckExact(value)) {
+        return CPy_GetTypeName((PyObject *)Py_TYPE(value));
+    }
+
+    Py_ssize_t size = PyTuple_GET_SIZE(value);
+    if (size > 10) {
+        // size is a Py_ssize_t, so it must be formatted with %zd, not %d.
+        return PyUnicode_FromFormat("tuple[<%zd items>]", size);
+    }
+    if (size == 0) {
+        // The loop below only appends the closing bracket after the last
+        // item, so the empty tuple needs to be closed explicitly.
+        return PyUnicode_FromString("tuple[]");
+    }
+
+    // Most of the logic is all for tuples, which is the only interesting case
+    PyObject *output = PyUnicode_FromString("tuple[");
+    if (!output) {
+        return NULL;
+    }
+    /* This is quadratic but if that ever matters something is really weird. */
+    for (Py_ssize_t i = 0; i < size; i++) {
+        PyObject *s = CPy_FormatTypeName(PyTuple_GET_ITEM(value, i));
+        if (!s) {
+            Py_DECREF(output);
+            return NULL;
+        }
+        PyObject *next = PyUnicode_FromFormat("%U%U%s", output, s,
+                                              i + 1 == size ? "]" : ", ");
+        Py_DECREF(output);
+        Py_DECREF(s);
+        if (!next) {
+            return NULL;
+        }
+        output = next;
+    }
+    return output;
+}
+
+// Raise TypeError saying that an `expected` object was required but a value
+// of a different type was given. The actual type is described with
+// CPy_FormatTypeName(); if that fails, a fallback message is used instead.
+CPy_NOINLINE
+void CPy_TypeError(const char *expected, PyObject *value) {
+    PyObject *actual = CPy_FormatTypeName(value);
+    if (actual == NULL) {
+        PyErr_Format(PyExc_TypeError, "%s object expected; and errored formatting real type!",
+                     expected);
+        return;
+    }
+    PyErr_Format(PyExc_TypeError, "%s object expected; got %U", expected, actual);
+    Py_DECREF(actual);
+}
+
+// The PyFrameObject type definition (struct _frame) has been moved
+// to the internal C API: to the pycore_frame.h header file.
+// https://github.com/python/cpython/pull/31530
+#if PY_VERSION_HEX >= 0x030b00a6
+#include "internal/pycore_frame.h"
+#endif
+
+// Add a traceback entry for (filename, funcname, line) to the exception
+// that is currently set.
+//
+// This function is basically the same as _PyTraceback_Add, which is
+// available in all the versions we support.
+// We're continuing to use this because we'll probably optimize this later.
+//
+// A throwaway code object and frame are created to carry the location;
+// globals is passed to PyFrame_New() as the frame's globals. If creating
+// either object fails, the saved exception is chained with whatever error
+// that failure raised.
+void CPy_AddTraceback(const char *filename, const char *funcname, int line, PyObject *globals) {
+    PyObject *exc, *val, *tb;
+    PyThreadState *thread_state = PyThreadState_GET();
+    PyFrameObject *frame_obj;
+
+    // We need to save off the exception state because in 3.8,
+    // PyFrame_New fails if there is an error set and it fails to look
+    // up builtins in the globals. (_PyTraceback_Add documents that it
+    // needs to do it because it decodes the filename according to the
+    // FS encoding, which could have a decoder in Python. We don't do
+    // that so *that* doesn't apply to us.)
+    PyErr_Fetch(&exc, &val, &tb);
+    PyCodeObject *code_obj = PyCode_NewEmpty(filename, funcname, line);
+    if (code_obj == NULL) {
+        goto error;
+    }
+
+    frame_obj = PyFrame_New(thread_state, code_obj, globals, 0);
+    if (frame_obj == NULL) {
+        Py_DECREF(code_obj);
+        goto error;
+    }
+    // Direct write to the frame struct field.
+    frame_obj->f_lineno = line;
+    // Put the original exception back before recording the traceback entry,
+    // then drop our references to the temporary code and frame objects.
+    PyErr_Restore(exc, val, tb);
+    PyTraceBack_Here(frame_obj);
+    Py_DECREF(code_obj);
+    Py_DECREF(frame_obj);
+
+    return;
+
+error:
+    // NOTE(review): on the 3.12 path only `exc` (the first slot filled by
+    // PyErr_Fetch) is passed on; `val` and `tb` are not released here.
+    // Confirm against the _PyErr_ChainExceptions1 contract that this is
+    // the intended object and that nothing leaks.
+#if CPY_3_12_FEATURES
+    _PyErr_ChainExceptions1(exc);
+#else
+    _PyErr_ChainExceptions(exc, val, tb);
+#endif
+}
+
+// Convenience wrapper: raise the TypeError produced by CPy_TypeError() and
+// then attach a traceback entry for the given source location.
+CPy_NOINLINE
+void CPy_TypeErrorTraceback(const char *filename,
+                            const char *funcname,
+                            int line,
+                            PyObject *globals,
+                            const char *expected,
+                            PyObject *value)
+{
+    CPy_TypeError(expected, value);
+    CPy_AddTraceback(filename, funcname, line, globals);
+}
+
+void CPy_AttributeError(const char *filename, const char *funcname, const char *classname,
+                        const char *attrname, int line, PyObject *globals) {
+    char buf[500];
+    snprintf(buf, sizeof(buf), "attribute '%.200s' of '%.200s' undefined", attrname, classname);
+    PyErr_SetString(PyExc_AttributeError, buf);
+    CPy_AddTraceback(filename, funcname, line, globals);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/float_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/float_ops.c
@@ -0,0 +1,239 @@
+// Float primitive operations
+//
+// These are registered in mypyc.primitives.float_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+
+static double CPy_DomainError(void) {
+    PyErr_SetString(PyExc_ValueError, "math domain error");
+    return CPY_FLOAT_ERROR;
+}
+
+static double CPy_MathRangeError(void) {
+    PyErr_SetString(PyExc_OverflowError, "math range error");
+    return CPY_FLOAT_ERROR;
+}
+
+// Raise ValueError reporting that a nonnegative input was expected, quoting
+// the repr of x. If x cannot be converted to a string, no ValueError is set
+// here. Always returns the float error value.
+static double CPy_MathExpectedNonNegativeInputError(double x) {
+    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
+    if (repr == NULL) {
+        return CPY_FLOAT_ERROR;
+    }
+    PyErr_Format(PyExc_ValueError, "expected a nonnegative input, got %s", repr);
+    PyMem_Free(repr);
+    return CPY_FLOAT_ERROR;
+}
+
+// Raise ValueError reporting that a positive input was expected, quoting
+// the repr of x. If x cannot be converted to a string, no ValueError is set
+// here. Always returns the float error value.
+static double CPy_MathExpectedPositiveInputError(double x) {
+    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
+    if (repr == NULL) {
+        return CPY_FLOAT_ERROR;
+    }
+    PyErr_Format(PyExc_ValueError, "expected a positive input, got %s", repr);
+    PyMem_Free(repr);
+    return CPY_FLOAT_ERROR;
+}
+
+// Raise ValueError reporting that a finite input was expected, quoting the
+// repr of x. If x cannot be converted to a string, no ValueError is set
+// here. Always returns the float error value.
+static double CPy_MathExpectedFiniteInput(double x) {
+    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
+    if (repr == NULL) {
+        return CPY_FLOAT_ERROR;
+    }
+    PyErr_Format(PyExc_ValueError, "expected a finite input, got %s", repr);
+    PyMem_Free(repr);
+    return CPY_FLOAT_ERROR;
+}
+
+// Convert a tagged int to a double. Short (unboxed) values are converted
+// directly; boxed values go through PyFloat_AsDouble(), and the float error
+// value is returned if that call fails.
+double CPyFloat_FromTagged(CPyTagged x) {
+    if (!CPyTagged_CheckShort(x)) {
+        double boxed = PyFloat_AsDouble(CPyTagged_LongAsObject(x));
+        // -1.0 is ambiguous: it is an error only if an exception is set.
+        if (unlikely(boxed == -1.0) && PyErr_Occurred()) {
+            return CPY_FLOAT_ERROR;
+        }
+        return boxed;
+    }
+    return CPyTagged_ShortAsSsize_t(x);
+}
+
+double CPyFloat_Sin(double x) {
+    double v = sin(x);
+    if (unlikely(isnan(v)) && !isnan(x)) {
+#if CPY_3_14_FEATURES
+        return CPy_MathExpectedFiniteInput(x);
+#else
+        return CPy_DomainError();
+#endif
+    }
+    return v;
+}
+
+double CPyFloat_Cos(double x) {
+    double v = cos(x);
+    if (unlikely(isnan(v)) && !isnan(x)) {
+#if CPY_3_14_FEATURES
+        return CPy_MathExpectedFiniteInput(x);
+#else
+        return CPy_DomainError();
+#endif
+    }
+    return v;
+}
+
+double CPyFloat_Tan(double x) {
+    if (unlikely(isinf(x))) {
+#if CPY_3_14_FEATURES
+        return CPy_MathExpectedFiniteInput(x);
+#else
+        return CPy_DomainError();
+#endif
+    }
+    return tan(x);
+}
+
+double CPyFloat_Sqrt(double x) {
+    if (x < 0.0) {
+#if CPY_3_14_FEATURES
+        return CPy_MathExpectedNonNegativeInputError(x);
+#else
+        return CPy_DomainError();
+#endif
+    }
+    return sqrt(x);
+}
+
+double CPyFloat_Exp(double x) {
+    double v = exp(x);
+    if (unlikely(v == INFINITY) && x != INFINITY) {
+        return CPy_MathRangeError();
+    }
+    return v;
+}
+
+double CPyFloat_Log(double x) {
+    if (x <= 0.0) {
+#if CPY_3_14_FEATURES
+        return CPy_MathExpectedPositiveInputError(x);
+#else
+        return CPy_DomainError();
+#endif
+    }
+    return log(x);
+}
+
+// Round x down with floor() and convert the result to a tagged int via
+// CPyTagged_FromFloat().
+CPyTagged CPyFloat_Floor(double x) {
+    return CPyTagged_FromFloat(floor(x));
+}
+
+// Round x up with ceil() and convert the result to a tagged int via
+// CPyTagged_FromFloat().
+CPyTagged CPyFloat_Ceil(double x) {
+    return CPyTagged_FromFloat(ceil(x));
+}
+
+// Return true if x is positive or negative infinity.
+bool CPyFloat_IsInf(double x) {
+    return isinf(x) ? true : false;
+}
+
+// Return true if x is a NaN.
+bool CPyFloat_IsNaN(double x) {
+    return isnan(x) ? true : false;
+}
+
+// From CPython 3.10.0, Objects/floatobject.c
+//
+// Compute Python-style floor division and modulo of vx by wx, storing the
+// quotient in *floordiv and the remainder in *mod. The remainder takes the
+// sign of the divisor. The caller is responsible for rejecting wx == 0.
+static void
+_float_div_mod(double vx, double wx, double *floordiv, double *mod)
+{
+    double rem = fmod(vx, wx);
+    /* fmod is typically exact, so vx-rem is *mathematically* an exact
+       multiple of wx.  In floating point the subtraction is only an
+       approximation, so quot may not be exactly integral after the
+       division, although it will always be very close to an integer. */
+    double quot = (vx - rem) / wx;
+
+    if (rem == 0.0) {
+        /* zero remainder: fmod's choice of signed zero differs across
+           platforms, so force the sign of the denominator. */
+        rem = copysign(0.0, wx);
+    }
+    else if ((wx < 0) != (rem < 0)) {
+        /* ensure the remainder has the same sign as the denominator */
+        rem += wx;
+        quot -= 1.0;
+    }
+
+    double snapped;
+    if (quot == 0.0) {
+        /* quotient is zero - give it the sign of the true quotient */
+        snapped = copysign(0.0, vx / wx);
+    }
+    else {
+        /* snap quotient to nearest integral value */
+        snapped = floor(quot);
+        if (quot - snapped > 0.5) {
+            snapped += 1.0;
+        }
+    }
+
+    *mod = rem;
+    *floordiv = snapped;
+}
+
+// Python-style float floor division x // y. Raises ZeroDivisionError and
+// returns the float error value when y is zero.
+double CPyFloat_FloorDivide(double x, double y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "float floor division by zero");
+        return CPY_FLOAT_ERROR;
+    }
+
+    double quotient;
+    double rem_unused;  // _float_div_mod always produces both results
+    _float_div_mod(x, y, &quotient, &rem_unused);
+    return quotient;
+}
+
+// Adapted from CPython 3.10.7
+//
+// Python-style x ** y for doubles. Non-finite operands are resolved by the
+// special-case table below before pow() is consulted; a NaN result from
+// pow() raises ValueError and an infinite one raises ValueError (zero base)
+// or OverflowError (overflow).
+double CPyFloat_Pow(double x, double y) {
+    if (!isfinite(x) || !isfinite(y)) {
+        if (isnan(x)) {
+            return y == 0.0 ? 1.0 : x; /* NaN**0 = 1 */
+        }
+        if (isnan(y)) {
+            return x == 1.0 ? 1.0 : y; /* 1**NaN = 1 */
+        }
+        if (isinf(x)) {
+            int odd_y = isfinite(y) && fmod(fabs(y), 2.0) == 1.0;
+            if (y > 0.0) {
+                return odd_y ? x : fabs(x);
+            }
+            if (y == 0.0) {
+                return 1.0;
+            }
+            /* y < 0. */
+            return odd_y ? copysign(0.0, x) : 0.0;
+        }
+        if (isinf(y)) {
+            double magnitude = fabs(x);
+            if (magnitude == 1.0) {
+                return 1.0;
+            }
+            if (y > 0.0 && magnitude > 1.0) {
+                return y;
+            }
+            if (y < 0.0 && magnitude < 1.0) {
+                #if PY_VERSION_HEX < 0x030B0000
+                if (x == 0.0) { /* 0**-inf: divide-by-zero */
+                    return CPy_DomainError();
+                }
+                #endif
+                return -y; /* result is +inf */
+            }
+            return 0.0;
+        }
+    }
+
+    double r = pow(x, y);
+    if (isfinite(r)) {
+        return r;
+    }
+    if (isnan(r)) {
+        return CPy_DomainError();
+    }
+    /*
+       an infinite result here arises either from:
+       (A) (+/-0.)**negative (-> divide-by-zero)
+       (B) overflow of x**y with x and y finite
+    */
+    if (isinf(r)) {
+        return x == 0.0 ? CPy_DomainError() : CPy_MathRangeError();
+    }
+    return r;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/generic_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/generic_ops.c
@@ -0,0 +1,84 @@
+// Generic primitive operations
+//
+// These are registered in mypyc.primitives.generic_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// Hash an object and return the result as a tagged int. Returns
+// CPY_INT_TAG when PyObject_Hash() reports failure (-1).
+CPyTagged CPyObject_Hash(PyObject *o) {
+    Py_hash_t hash = PyObject_Hash(o);
+    if (hash != -1) {
+        // Unfortunate: hash values on 64-bit Python span the full 64 bits
+        // while short tagged ints only have 63, so about half of all
+        // hashes get boxed here -- and are most likely unboxed again right
+        // away by a tp_hash wrapper.
+        return CPyTagged_FromSsize_t(hash);
+    }
+    return CPY_INT_TAG;
+}
+
+// Three-argument getattr: return v.name, or a new reference to defl when
+// the lookup fails with AttributeError. Any other error is propagated
+// (NULL is returned with the exception still set).
+PyObject *CPyObject_GetAttr3(PyObject *v, PyObject *name, PyObject *defl)
+{
+    PyObject *attr = PyObject_GetAttr(v, name);
+    if (attr != NULL) {
+        return attr;
+    }
+    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
+        return NULL;
+    }
+    PyErr_Clear();
+    Py_INCREF(defl);
+    return defl;
+}
+
+// Call the tp_iternext slot of iter's type directly and return its result.
+// The slot is not checked for NULL here.
+PyObject *CPyIter_Next(PyObject *iter)
+{
+    iternextfunc next = Py_TYPE(iter)->tp_iternext;
+    return next(iter);
+}
+
+// Two-argument power: forwards to PyNumber_Power() with None as the third
+// (modulus) argument.
+PyObject *CPyNumber_Power(PyObject *base, PyObject *index)
+{
+    PyObject *no_modulus = Py_None;
+    return PyNumber_Power(base, index, no_modulus);
+}
+
+// Two-argument in-place power: forwards to PyNumber_InPlacePower() with
+// None as the third (modulus) argument.
+PyObject *CPyNumber_InPlacePower(PyObject *base, PyObject *index)
+{
+    PyObject *no_modulus = Py_None;
+    return PyNumber_InPlacePower(base, index, no_modulus);
+}
+
+// Return obj[start:end] for tagged int bounds.
+//
+// The bounds are boxed, wrapped in a slice object with no step, and passed
+// to PyObject_GetItem(). Returns a new reference, or NULL with an exception
+// set if boxing, slice creation or the item lookup fails.
+PyObject *CPyObject_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
+    PyObject *start_obj = CPyTagged_AsObject(start);
+    if (unlikely(start_obj == NULL)) {
+        return NULL;
+    }
+    PyObject *end_obj = CPyTagged_AsObject(end);
+    if (unlikely(end_obj == NULL)) {
+        // Don't leak the bound that was boxed successfully.
+        Py_DECREF(start_obj);
+        return NULL;
+    }
+    PyObject *slice = PySlice_New(start_obj, end_obj, NULL);
+    // PySlice_New() takes its own references; release ours either way.
+    Py_DECREF(start_obj);
+    Py_DECREF(end_obj);
+    if (unlikely(slice == NULL)) {
+        return NULL;
+    }
+    PyObject *result = PyObject_GetItem(obj, slice);
+    Py_DECREF(slice);
+    return result;
+}
+
+// Signature of the per-class setup function stored in tp_methods.
+typedef PyObject *(*SetupFunction)(PyObject *);
+
+// Find and call the setup function for `type`.
+//
+// Walks the tp_base chain starting at `type` and looks at the FIRST entry
+// of each type's tp_methods table only. The first such entry named
+// "__internal_mypyc_setup" is called with `type` and its result returned.
+// Raises RuntimeError and returns NULL if no type in the chain has one.
+PyObject *CPy_SetupObject(PyObject *type) {
+    for (PyTypeObject *tp = (PyTypeObject *)type; tp != NULL; tp = tp->tp_base) {
+        PyMethodDef *first = tp->tp_methods;
+        if (first != NULL && first->ml_name != NULL
+            && strcmp(first->ml_name, "__internal_mypyc_setup") == 0) {
+            SetupFunction setup = (SetupFunction)(void(*)(void))first->ml_meth;
+            return setup(type);
+        }
+    }
+
+    PyErr_SetString(PyExc_RuntimeError, "Internal mypyc error: Unable to find object setup function");
+    return NULL;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/getargs.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/getargs.c
@@ -0,0 +1,451 @@
+/* getargs implementation copied from Python 3.8 and stripped down to only include
+ * the functions we need.
+ * We also add support for required kwonly args and accepting *args / **kwargs.
+ * A good idea would be to also vendor in the Fast versions and get our stuff
+ * working with *that*.
+ * Another probably good idea is to strip out all the formatting stuff we don't need
+ * and then add in custom stuff that we do need.
+ *
+ * DOCUMENTATION OF THE EXTENSIONS:
+ *  - Arguments given after a @ format specifier are required keyword-only arguments.
+ *    The | and $ specifiers must both appear before @.
+ *  - If the first character of a format string is %, then the function can support
+ *    *args and **kwargs. On seeing a %, the parser will consume two arguments,
+ *    which should be pointers to variables to store the *args and **kwargs, respectively.
+ *    Either pointer can be NULL, in which case the function doesn't take that
+ *    variety of vararg.
+ *    Unlike most format specifiers, the caller takes ownership of these objects
+ *    and is responsible for decrefing them.
+ *  - All arguments must use the 'O' format.
+ *  - There's minimal error checking of format strings. They are generated
+ *    programmatically and can be assumed valid.
+ */
+
+// These macro definitions are copied from pyport.h in Python 3.9 and later
+// https://bugs.python.org/issue19569
+//
+// They expand to compiler-specific pragmas that save the diagnostic state
+// (PUSH), silence deprecated-declaration warnings (IGNORE_DEPR_DECLS) and
+// restore the saved state (POP). On compilers other than clang, GCC >= 4.6
+// and MSVC they expand to nothing.
+#if defined(__clang__)
+#define _Py_COMP_DIAG_PUSH _Pragma("clang diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("clang diagnostic pop")
+#elif defined(__GNUC__) \
+    && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6))
+#define _Py_COMP_DIAG_PUSH _Pragma("GCC diagnostic push")
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
+    _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
+#define _Py_COMP_DIAG_POP _Pragma("GCC diagnostic pop")
+#elif defined(_MSC_VER)
+// MSVC warning 4996 is the one disabled for deprecated declarations.
+#define _Py_COMP_DIAG_PUSH __pragma(warning(push))
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS __pragma(warning(disable: 4996))
+#define _Py_COMP_DIAG_POP __pragma(warning(pop))
+#else
+#define _Py_COMP_DIAG_PUSH
+#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS
+#define _Py_COMP_DIAG_POP
+#endif
+
+#include "Python.h"
+#include "pythonsupport.h"
+
+#include <ctype.h>
+#include <float.h>
+
+/* Fallback for builds where the PyDict_GET_SIZE macro is not defined:
+ * use the PyDict_Size() function instead. */
+#ifndef PyDict_GET_SIZE
+#define PyDict_GET_SIZE(d) PyDict_Size(d)
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int CPyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
+                                 const char *, const char *, const char * const *, ...);
+
+/* Forward */
+static int vgetargskeywords(PyObject *, PyObject *,
+                            const char *, const char *, const char * const *, va_list *);
+static void skipitem(const char **, va_list *);
+
+/* Support for keyword arguments donated by
+   Geoff Philbrick <philbric@delphi.hks.com> */
+
+/* Variadic entry point: parse `args` / `keywords` according to `format`
+ * and `kwlist`, storing results through the output pointers that follow
+ * `kwlist`. `fname` is the function name used in error messages.
+ * Returns true on success and false (0) on error. */
+int
+CPyArg_ParseTupleAndKeywords(PyObject *args,
+                             PyObject *keywords,
+                             const char *format,
+                             const char *fname,
+                             const char * const *kwlist, ...)
+{
+    va_list ap;
+
+    va_start(ap, kwlist);
+    int ok = vgetargskeywords(args, keywords, format, fname, kwlist, &ap);
+    va_end(ap);
+    return ok;
+}
+
+/* True when c ends the argument part of a format string: the NUL
+ * terminator, ';' or ':'. The argument is parenthesized so that any
+ * expression can be passed safely; note that it is evaluated up to three
+ * times, so it must not have side effects. */
+#define IS_END_OF_FORMAT(c) ((c) == '\0' || (c) == ';' || (c) == ':')
+
+/* Core of CPyArg_ParseTupleAndKeywords().
+ *
+ * Walks kwlist and the format string in lockstep, binding each parameter
+ * from the positional tuple `args` or the keyword dict `kwargs` and
+ * storing it through the next PyObject ** taken from *p_va.
+ *
+ * Format markers handled here:
+ *   '%' (first char only)  two extra pointers are read from *p_va to
+ *                          receive the *args tuple and **kwargs dict;
+ *   '|'                    the following parameters are optional;
+ *   '$'                    the following parameters are keyword-only;
+ *   '@'                    the following parameters are required
+ *                          keyword-only.
+ * Leading empty strings in kwlist denote positional-only parameters.
+ *
+ * Bound arguments are stored as borrowed references (the temporary strong
+ * reference is dropped right after the store). The *args / **kwargs
+ * objects, when requested, are new references owned by the caller.
+ *
+ * Returns 1 on success, 0 with an exception set on failure.
+ */
+static int
+vgetargskeywords(PyObject *args, PyObject *kwargs, const char *format,
+                 const char *fname, const char * const *kwlist, va_list *p_va)
+{
+    /* min: index of the first optional parameter ('|');
+     * max: index of the first keyword-only parameter ('$');
+     * INT_MAX means the marker has not been seen. */
+    int min = INT_MAX;
+    int max = INT_MAX;
+    int required_kwonly_start = INT_MAX;
+    int has_required_kws = 0;
+    int i, pos, len;
+    /* skip: set once a required positional-only argument is found missing;
+     * the error is raised later, when min/max are known. */
+    int skip = 0;
+    Py_ssize_t nargs, nkwargs;
+    PyObject *current_arg;
+    int bound_pos_args;
+
+    PyObject **p_args = NULL, **p_kwargs = NULL;
+
+    assert(args != NULL && PyTuple_Check(args));
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+    assert(format != NULL);
+    assert(kwlist != NULL);
+    assert(p_va != NULL);
+
+    /* scan kwlist and count the number of positional-only parameters */
+    for (pos = 0; kwlist[pos] && !*kwlist[pos]; pos++) {
+    }
+    /* scan kwlist and get greatest possible nbr of args */
+    for (len = pos; kwlist[len]; len++) {
+#ifdef DEBUG
+        if (!*kwlist[len]) {
+            PyErr_SetString(PyExc_SystemError,
+                            "Empty keyword parameter name");
+            return 0;
+        }
+#endif
+    }
+
+    /* A leading '%' means the caller supplied destinations for *args and
+     * **kwargs (either may be NULL). */
+    if (*format == '%') {
+        p_args = va_arg(*p_va, PyObject **);
+        p_kwargs = va_arg(*p_va, PyObject **);
+        format++;
+    }
+
+    nargs = PyTuple_GET_SIZE(args);
+    nkwargs = (kwargs == NULL) ? 0 : PyDict_GET_SIZE(kwargs);
+    if (unlikely(nargs + nkwargs > len && !p_args && !p_kwargs)) {
+        /* Adding "keyword" (when nargs == 0) prevents producing wrong error
+           messages in some special cases (see bpo-31229). */
+        PyErr_Format(PyExc_TypeError,
+                     "%.200s%s takes at most %d %sargument%s (%zd given)",
+                     (fname == NULL) ? "function" : fname,
+                     (fname == NULL) ? "" : "()",
+                     len,
+                     (nargs == 0) ? "keyword " : "",
+                     (len == 1) ? "" : "s",
+                     nargs + nkwargs);
+        return 0;
+    }
+
+    /* convert tuple args and keyword args in same loop, using kwlist to drive process */
+    for (i = 0; i < len; i++) {
+        if (*format == '|') {
+#ifdef DEBUG
+            if (min != INT_MAX) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Invalid format string (| specified twice)");
+                return 0;
+            }
+#endif
+
+            min = i;
+            format++;
+
+#ifdef DEBUG
+            if (max != INT_MAX) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Invalid format string ($ before |)");
+                return 0;
+            }
+#endif
+
+            /* If there are optional args, figure out whether we have
+             * required keyword arguments so that we don't bail without
+             * enforcing them. */
+            has_required_kws = strchr(format, '@') != NULL;
+        }
+        if (*format == '$') {
+#ifdef DEBUG
+            if (max != INT_MAX) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Invalid format string ($ specified twice)");
+                return 0;
+            }
+#endif
+
+            max = i;
+            format++;
+
+#ifdef DEBUG
+            if (max < pos) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Empty parameter name after $");
+                return 0;
+            }
+#endif
+            if (skip) {
+                /* Now we know the minimal and the maximal numbers of
+                 * positional arguments and can raise an exception with
+                 * informative message (see below). */
+                break;
+            }
+            if (unlikely(max < nargs && !p_args)) {
+                if (max == 0) {
+                    PyErr_Format(PyExc_TypeError,
+                                 "%.200s%s takes no positional arguments",
+                                 (fname == NULL) ? "function" : fname,
+                                 (fname == NULL) ? "" : "()");
+                }
+                else {
+                    PyErr_Format(PyExc_TypeError,
+                                 "%.200s%s takes %s %d positional argument%s"
+                                 " (%zd given)",
+                                 (fname == NULL) ? "function" : fname,
+                                 (fname == NULL) ? "" : "()",
+                                 (min < max) ? "at most" : "exactly",
+                                 max,
+                                 max == 1 ? "" : "s",
+                                 nargs);
+                }
+                return 0;
+            }
+        }
+        if (*format == '@') {
+#ifdef DEBUG
+            if (min == INT_MAX && max == INT_MAX) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Invalid format string "
+                                "(@ without preceding | and $)");
+                return 0;
+            }
+            if (required_kwonly_start != INT_MAX) {
+                PyErr_SetString(PyExc_SystemError,
+                                "Invalid format string (@ specified twice)");
+                return 0;
+            }
+#endif
+
+            required_kwonly_start = i;
+            format++;
+        }
+#ifdef DEBUG
+        if (IS_END_OF_FORMAT(*format)) {
+            PyErr_Format(PyExc_SystemError,
+                         "More keyword list entries (%d) than "
+                         "format specifiers (%d)", len, i);
+            return 0;
+        }
+#endif
+        if (!skip) {
+            /* Look for parameter i first among the positional arguments,
+             * then (unless it is positional-only) among the keywords. */
+            if (i < nargs && i < max) {
+                current_arg = Py_NewRef(PyTuple_GET_ITEM(args, i));
+            }
+            else if (nkwargs && i >= pos) {
+                if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
+                    return 0;
+                }
+                if (current_arg) {
+                    --nkwargs;
+                }
+            }
+            else {
+                current_arg = NULL;
+            }
+
+            if (current_arg) {
+                /* Store the pointer, then drop the temporary strong
+                 * reference: the caller receives a borrowed reference. */
+                PyObject **p = va_arg(*p_va, PyObject **);
+                *p = current_arg;
+                Py_DECREF(current_arg);
+                format++;
+                continue;
+            }
+
+            if (i < min || i >= required_kwonly_start) {
+                if (likely(i < pos)) {
+                    assert (min == INT_MAX);
+                    assert (max == INT_MAX);
+                    skip = 1;
+                    /* At that moment we still don't know the minimal and
+                     * the maximal numbers of positional arguments.  Raising
+                     * an exception is deferred until we encounter | and $
+                     * or the end of the format. */
+                }
+                else {
+                    if (i >= max) {
+                        PyErr_Format(PyExc_TypeError,
+                                     "%.200s%s missing required "
+                                     "keyword-only argument '%s'",
+                                     (fname == NULL) ? "function" : fname,
+                                     (fname == NULL) ? "" : "()",
+                                     kwlist[i]);
+                    }
+                    else {
+                        PyErr_Format(PyExc_TypeError,
+                                     "%.200s%s missing required "
+                                     "argument '%s' (pos %d)",
+                                     (fname == NULL) ? "function" : fname,
+                                     (fname == NULL) ? "" : "()",
+                                     kwlist[i], i+1);
+                    }
+                    return 0;
+                }
+            }
+            /* current code reports success when all required args
+             * fulfilled and no keyword args left, with no further
+             * validation. XXX Maybe skip this in debug build ?
+             */
+            if (!nkwargs && !skip && !has_required_kws &&
+                !p_args && !p_kwargs)
+            {
+                return 1;
+            }
+        }
+
+        /* We are into optional args, skip through to any remaining
+         * keyword args */
+        skipitem(&format, p_va);
+    }
+
+    if (unlikely(skip)) {
+        PyErr_Format(PyExc_TypeError,
+                     "%.200s%s takes %s %d positional argument%s"
+                     " (%zd given)",
+                     (fname == NULL) ? "function" : fname,
+                     (fname == NULL) ? "" : "()",
+                     (Py_MIN(pos, min) < i) ? "at least" : "exactly",
+                     Py_MIN(pos, min),
+                     Py_MIN(pos, min) == 1 ? "" : "s",
+                     nargs);
+        return 0;
+    }
+
+#ifdef DEBUG
+    if (!IS_END_OF_FORMAT(*format) &&
+        (*format != '|') && (*format != '$') && (*format != '@'))
+    {
+        PyErr_Format(PyExc_SystemError,
+            "more argument specifiers than keyword list entries "
+            "(remaining format:'%s')", format);
+        return 0;
+    }
+#endif
+
+    /* Positional arguments beyond the bound ones go into *args. */
+    bound_pos_args = Py_MIN(nargs, Py_MIN(max, len));
+    if (p_args) {
+        *p_args = PyTuple_GetSlice(args, bound_pos_args, nargs);
+        if (!*p_args) {
+            return 0;
+        }
+    }
+
+    if (p_kwargs) {
+        /* This unfortunately needs to be special cased because if len is 0 then we
+         * never go through the main loop. */
+        if (unlikely(nargs > 0 && len == 0 && !p_args)) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s%s takes no positional arguments",
+                         (fname == NULL) ? "function" : fname,
+                         (fname == NULL) ? "" : "()");
+
+            return 0;
+        }
+
+        *p_kwargs = PyDict_New();
+        if (!*p_kwargs) {
+            goto latefail;
+        }
+    }
+
+    /* nkwargs now counts the keywords that were not consumed above. */
+    if (nkwargs > 0) {
+        PyObject *key, *value;
+        Py_ssize_t j;
+        /* make sure there are no arguments given by name and position */
+        for (i = pos; i < bound_pos_args && i < len; i++) {
+            PyObject *current_arg;
+            if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
+                goto latefail;
+            }
+            if (unlikely(current_arg != NULL)) {
+                Py_DECREF(current_arg);
+                /* arg present in tuple and in dict */
+                PyErr_Format(PyExc_TypeError,
+                             "argument for %.200s%s given by name ('%s') "
+                             "and position (%d)",
+                             (fname == NULL) ? "function" : fname,
+                             (fname == NULL) ? "" : "()",
+                             kwlist[i], i+1);
+                goto latefail;
+            }
+        }
+        /* make sure there are no extraneous keyword arguments */
+        j = 0;
+        while (PyDict_Next(kwargs, &j, &key, &value)) {
+            int match = 0;
+            if (unlikely(!PyUnicode_Check(key))) {
+                PyErr_SetString(PyExc_TypeError,
+                                "keywords must be strings");
+                goto latefail;
+            }
+            for (i = pos; i < len; i++) {
+                if (PyUnicode_EqualToUTF8(key, kwlist[i])) {
+                    match = 1;
+                    break;
+                }
+            }
+            if (!match) {
+                /* Unknown keyword: an error unless the function accepts
+                 * **kwargs, in which case it is collected there. */
+                if (unlikely(!p_kwargs)) {
+                    PyErr_Format(PyExc_TypeError,
+                                 "'%U' is an invalid keyword "
+                                 "argument for %.200s%s",
+                                 key,
+                                 (fname == NULL) ? "this function" : fname,
+                                 (fname == NULL) ? "" : "()");
+                    goto latefail;
+                } else {
+                    if (PyDict_SetItem(*p_kwargs, key, value) < 0) {
+                        goto latefail;
+                    }
+                }
+            }
+        }
+    }
+
+    return 1;
+    /* Handle failures that have happened after we have tried to
+     * create *args and **kwargs, if they exist. */
+latefail:
+    if (p_args) {
+        Py_XDECREF(*p_args);
+    }
+    if (p_kwargs) {
+        Py_XDECREF(*p_kwargs);
+    }
+    return 0;
+}
+
+
+/* Skip one format unit without storing anything: advance *p_format by one
+ * character and, if p_va is non-NULL, consume the matching PyObject **
+ * from the argument list. Stepping a single character relies on every
+ * format unit being one character long (the file header states that all
+ * arguments use the 'O' format). */
+static void
+skipitem(const char **p_format, va_list *p_va)
+{
+    (*p_format)++;
+
+    if (p_va != NULL) {
+        (void) va_arg(*p_va, PyObject **);
+    }
+}
+
+#ifdef __cplusplus
+};
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/getargsfast.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/getargsfast.c
@@ -0,0 +1,569 @@
+/* getargskeywordsfast implementation copied from Python 3.9 and stripped down to
+ * only include the functionality we need.
+ *
+ * We also add support for required kwonly args and accepting *args / **kwargs.
+ *
+ * DOCUMENTATION OF THE EXTENSIONS:
+ *  - Arguments given after a @ format specify required keyword-only arguments.
+ *    The | and $ specifiers must both appear before @.
+ *  - If the first character of a format string is %, then the function can support
+ *    *args and/or **kwargs. In this case the parser will consume two arguments,
+ *    which should be pointers to variables to store the *args and **kwargs, respectively.
+ *    Either pointer can be NULL, in which case the function doesn't take that
+ *    variety of vararg.
+ *    Unlike most format specifiers, the caller takes ownership of these objects
+ *    and is responsible for decrefing them.
+ */
+
+#include <Python.h>
+#include "CPy.h"
+
+#define PARSER_INITED(parser) ((parser)->kwtuple != NULL)
+
+/* Forward */
+static int
+vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
+                          PyObject *kwargs, PyObject *kwnames,
+                          CPyArg_Parser *parser,
+                          va_list *p_va);
+static void skipitem_fast(const char **, va_list *);
+
+/* Generic entry point: parse an argument array plus keyword-name tuple for
+ * any signature.
+ *
+ * The variadic tail supplies the output slots consumed by the parser.
+ * The result of vgetargskeywordsfast_impl() is returned unchanged. */
+int
+CPyArg_ParseStackAndKeywords(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                             CPyArg_Parser *parser, ...)
+{
+    va_list ap;
+    int status;
+
+    va_start(ap, parser);
+    status = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
+    va_end(ap);
+
+    return status;
+}
+
+/* Variant for functions that take no arguments.
+ *
+ * A call with neither positional nor keyword arguments needs no parsing and
+ * reports success (1) immediately; every other call is handed to the general
+ * parser and its result is returned. */
+int
+CPyArg_ParseStackAndKeywordsNoArgs(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                   CPyArg_Parser *parser, ...)
+{
+    va_list ap;
+    int status = 1;
+
+    va_start(ap, parser);
+    if (nargs != 0 || kwnames != NULL) {
+        /* Something was passed: defer to the full parser */
+        status = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
+    }
+    va_end(ap);
+
+    return status;
+}
+
+/* Variant for functions that take a single argument.
+ *
+ * When exactly one positional argument and no keyword names are given, the
+ * argument pointer is copied straight into the first output slot (no INCREF)
+ * and 1 is returned. Any other call shape goes through the general parser. */
+int
+CPyArg_ParseStackAndKeywordsOneArg(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                   CPyArg_Parser *parser, ...)
+{
+    va_list ap;
+    int status;
+
+    va_start(ap, parser);
+    if (kwnames != NULL || nargs != 1) {
+        status = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
+    }
+    else {
+        /* Fast path: the lone positional argument fills the only slot */
+        PyObject **slot = va_arg(ap, PyObject **);
+        *slot = args[0];
+        status = 1;
+    }
+    va_end(ap);
+
+    return status;
+}
+
+/* Variant for functions without keyword-only args, *args or **kwargs.
+ *
+ * If the parser has already been initialized, no keyword names were passed
+ * and the positional count lies within [parser->min, parser->max], each
+ * positional argument is copied into the next output slot (no INCREF) and 1
+ * is returned. Otherwise the general parser does the work. */
+int
+CPyArg_ParseStackAndKeywordsSimple(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
+                                   CPyArg_Parser *parser, ...)
+{
+    va_list ap;
+    int status;
+    Py_ssize_t k;
+
+    va_start(ap, parser);
+    if (kwnames != NULL || !PARSER_INITED(parser) ||
+            nargs < parser->min || nargs > parser->max) {
+        status = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
+    }
+    else {
+        /* Fast path: positional arguments only, and an acceptable number of them */
+        for (k = 0; k < nargs; k++) {
+            PyObject **slot = va_arg(ap, PyObject **);
+            *slot = args[k];
+        }
+        status = 1;
+    }
+    va_end(ap);
+
+    return status;
+}
+
+/* True when c terminates the argument-specifier part of a format string: the
+ * NUL terminator, or the ';' / ':' that introduces a custom error message or
+ * the function name. The argument is evaluated more than once. */
+#define IS_END_OF_FORMAT(c) ((c) == '\0' || (c) == ';' || (c) == ':')
+
+
+/* List of static parsers. */
+static struct CPyArg_Parser *static_arg_parsers = NULL;
+
+/* Lazily initialize a static argument parser.
+ *
+ * Derives the following from parser->keywords and parser->format:
+ *  - parser->pos: number of leading empty keyword names (positional-only
+ *    parameters);
+ *  - parser->varargs: set when the format starts with '%' (the '%' is skipped
+ *    by advancing parser->format);
+ *  - parser->fname / parser->custom_msg: text following ':' or ';';
+ *  - parser->min / parser->max: positions of '|' and '$', capped at the
+ *    number of keywords;
+ *  - parser->required_kwonly_start / parser->has_required_kws: position of '@';
+ *  - parser->kwtuple: tuple of interned names for the non-positional-only
+ *    parameters.
+ *
+ * On success the parser is pushed onto the static_arg_parsers list.
+ *
+ * Returns 1 on success (or if the parser was already initialized) and 0 on
+ * failure. Malformed format strings raise SystemError; a failure to create
+ * the tuple or a name string also returns 0. */
+static int
+parser_init(CPyArg_Parser *parser)
+{
+    const char * const *keywords;
+    /* NOTE(review): msg is declared but never used in this function. */
+    const char *format, *msg;
+    int i, len, min, max, nkw;
+    PyObject *kwtuple;
+
+    assert(parser->keywords != NULL);
+    /* A non-NULL kwtuple marks a parser that has already been set up. */
+    if (PARSER_INITED(parser)) {
+        return 1;
+    }
+
+    keywords = parser->keywords;
+    /* scan keywords and count the number of positional-only parameters */
+    for (i = 0; keywords[i] && !*keywords[i]; i++) {
+    }
+    parser->pos = i;
+    /* scan keywords and get greatest possible nbr of args */
+    for (; keywords[i]; i++) {
+        if (!*keywords[i]) {
+            /* An empty name after a named parameter is not allowed. */
+            PyErr_SetString(PyExc_SystemError,
+                            "Empty keyword parameter name");
+            return 0;
+        }
+    }
+    len = i;
+
+    /* INT_MAX means "no '@' seen", i.e. no required keyword-only args. */
+    parser->required_kwonly_start = INT_MAX;
+    /* NOTE(review): parser->format is dereferenced here before the NULL
+     * check on format below -- confirm callers never pass a NULL format. */
+    if (*parser->format == '%') {
+        parser->format++;
+        parser->varargs = 1;
+    }
+
+    format = parser->format;
+    if (format) {
+        /* grab the function name or custom error msg first (mutually exclusive) */
+        parser->fname = strchr(parser->format, ':');
+        if (parser->fname) {
+            parser->fname++;
+            parser->custom_msg = NULL;
+        }
+        else {
+            parser->custom_msg = strchr(parser->format,';');
+            if (parser->custom_msg)
+                parser->custom_msg++;
+        }
+
+        /* INT_MAX marks '|' (min) and '$' (max) as not yet seen. */
+        min = max = INT_MAX;
+        /* Walk one format unit per keyword entry, recording where the
+         * '|', '$' and '@' markers occur. */
+        for (i = 0; i < len; i++) {
+            if (*format == '|') {
+                if (min != INT_MAX) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Invalid format string (| specified twice)");
+                    return 0;
+                }
+                if (max != INT_MAX) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Invalid format string ($ before |)");
+                    return 0;
+                }
+                min = i;
+                format++;
+            }
+            if (*format == '$') {
+                if (max != INT_MAX) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Invalid format string ($ specified twice)");
+                    return 0;
+                }
+                if (i < parser->pos) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Empty parameter name after $");
+                    return 0;
+                }
+                max = i;
+                format++;
+            }
+            if (*format == '@') {
+                if (parser->required_kwonly_start != INT_MAX) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Invalid format string (@ specified twice)");
+                    return 0;
+                }
+                /* Rejected only when neither '|' nor '$' has been seen. */
+                if (min == INT_MAX && max == INT_MAX) {
+                    PyErr_SetString(PyExc_SystemError,
+                                    "Invalid format string "
+                                    "(@ without preceding | and $)");
+                    return 0;
+                }
+                format++;
+                parser->has_required_kws = 1;
+                parser->required_kwonly_start = i;
+            }
+            if (IS_END_OF_FORMAT(*format)) {
+                PyErr_Format(PyExc_SystemError,
+                            "More keyword list entries (%d) than "
+                            "format specifiers (%d)", len, i);
+                return 0;
+            }
+
+            /* Advance past this parameter's format unit. */
+            skipitem_fast(&format, NULL);
+        }
+        /* Without '|' / '$' the limits default to the number of keywords. */
+        parser->min = Py_MIN(min, len);
+        parser->max = Py_MIN(max, len);
+
+        if (!IS_END_OF_FORMAT(*format) && (*format != '|') && (*format != '$')) {
+            PyErr_Format(PyExc_SystemError,
+                "more argument specifiers than keyword list entries "
+                "(remaining format:'%s')", format);
+            return 0;
+        }
+    }
+
+    /* Build the tuple of names for parameters that can be passed by keyword. */
+    nkw = len - parser->pos;
+    kwtuple = PyTuple_New(nkw);
+    if (kwtuple == NULL) {
+        return 0;
+    }
+    keywords = parser->keywords + parser->pos;
+    for (i = 0; i < nkw; i++) {
+        PyObject *str = PyUnicode_FromString(keywords[i]);
+        if (str == NULL) {
+            Py_DECREF(kwtuple);
+            return 0;
+        }
+        /* Interned names let find_keyword() match by pointer identity. */
+        PyUnicode_InternInPlace(&str);
+        PyTuple_SET_ITEM(kwtuple, i, str);
+    }
+    parser->kwtuple = kwtuple;
+
+    /* Prepend this parser to the list of initialized static parsers. */
+    assert(parser->next == NULL);
+    parser->next = static_arg_parsers;
+    static_arg_parsers = parser;
+    return 1;
+}
+
+/* Find the value passed for keyword `key` in a call.
+ *
+ * kwnames is the tuple of keyword names and kwstack the parallel array of
+ * values. The returned pointer is taken straight from kwstack (no INCREF);
+ * NULL means that no keyword name matched. */
+static PyObject*
+find_keyword(PyObject *kwnames, PyObject *const *kwstack, PyObject *key)
+{
+    const Py_ssize_t count = PyTuple_GET_SIZE(kwnames);
+    Py_ssize_t idx;
+
+    /* First pass: pointer identity only. Keyword names are normally interned
+       strings, so this usually finds the match without comparing contents. */
+    for (idx = 0; idx < count; idx++) {
+        if (PyTuple_GET_ITEM(kwnames, idx) == key) {
+            return kwstack[idx];
+        }
+    }
+
+    /* Second pass: fall back to comparing string contents. */
+    for (idx = 0; idx < count; idx++) {
+        PyObject *name = PyTuple_GET_ITEM(kwnames, idx);
+        assert(PyUnicode_Check(name));
+        if (PyUnicode_Equal(name, key)) {
+            return kwstack[idx];
+        }
+    }
+
+    return NULL;
+}
+
+/* Core parser shared by the CPyArg_ParseStackAndKeywords* entry points.
+ *
+ * args/nargs: positional arguments; when kwnames is non-NULL the keyword
+ *     values follow them in the same array (starting at args + nargs).
+ * kwargs:  optional dict of keyword arguments (must be NULL if kwnames is set).
+ * kwnames: optional tuple of keyword names.
+ * parser:  static parser description; initialized on first use.
+ * p_va:    output slots. If parser->varargs is set, the first two are the
+ *     PyObject ** slots for *args and **kwargs (either may be NULL); they are
+ *     followed by one PyObject ** slot per parameter.
+ *
+ * Returns 1 on success and 0 on failure. Values stored into parameter slots
+ * are not INCREF'd here. The *args tuple and **kwargs dict are newly created
+ * objects and are decref'd again if a later step fails. Slots of optional
+ * parameters that were not supplied are left untouched. */
+static int
+vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
+                          PyObject *kwargs, PyObject *kwnames,
+                          CPyArg_Parser *parser,
+                          va_list *p_va)
+{
+    PyObject *kwtuple;
+    const char *format;
+    PyObject *keyword;
+    int i, pos, len;
+    Py_ssize_t nkwargs;
+    PyObject *current_arg;
+    PyObject *const *kwstack = NULL;
+    int bound_pos_args;
+    PyObject **p_args = NULL, **p_kwargs = NULL;
+
+    assert(kwargs == NULL || PyDict_Check(kwargs));
+    assert(kwargs == NULL || kwnames == NULL);
+    assert(p_va != NULL);
+
+    if (!parser_init(parser)) {
+        return 0;
+    }
+
+    kwtuple = parser->kwtuple;
+    pos = parser->pos;
+    /* Total parameter count: positional-only plus named parameters. */
+    len = pos + (int)PyTuple_GET_SIZE(kwtuple);
+
+    if (parser->varargs) {
+        /* The *args / **kwargs output slots come first in the va_list. */
+        p_args = va_arg(*p_va, PyObject **);
+        p_kwargs = va_arg(*p_va, PyObject **);
+    }
+
+    if (kwargs != NULL) {
+        nkwargs = PyDict_GET_SIZE(kwargs);
+    }
+    else if (kwnames != NULL) {
+        nkwargs = PyTuple_GET_SIZE(kwnames);
+        /* Keyword values are stored right after the positional arguments. */
+        kwstack = args + nargs;
+    }
+    else {
+        nkwargs = 0;
+    }
+    if (nargs + nkwargs > len && !p_args && !p_kwargs) {
+        /* Adding "keyword" (when nargs == 0) prevents producing wrong error
+           messages in some special cases (see bpo-31229). */
+        PyErr_Format(PyExc_TypeError,
+                     "%.200s%s takes at most %d %sargument%s (%zd given)",
+                     (parser->fname == NULL) ? "function" : parser->fname,
+                     (parser->fname == NULL) ? "" : "()",
+                     len,
+                     (nargs == 0) ? "keyword " : "",
+                     (len == 1) ? "" : "s",
+                     nargs + nkwargs);
+        return 0;
+    }
+    if (parser->max < nargs && !p_args) {
+        if (parser->max == 0) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s%s takes no positional arguments",
+                         (parser->fname == NULL) ? "function" : parser->fname,
+                         (parser->fname == NULL) ? "" : "()");
+        }
+        else {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s%s takes %s %d positional argument%s (%zd given)",
+                         (parser->fname == NULL) ? "function" : parser->fname,
+                         (parser->fname == NULL) ? "" : "()",
+                         (parser->min < parser->max) ? "at most" : "exactly",
+                         parser->max,
+                         parser->max == 1 ? "" : "s",
+                         nargs);
+        }
+        return 0;
+    }
+
+    format = parser->format;
+
+    /* convert tuple args and keyword args in same loop, using kwtuple to drive process */
+    for (i = 0; i < len; i++) {
+        /* The markers carry no value of their own; parser_init() already
+         * recorded their positions. */
+        if (*format == '|') {
+            format++;
+        }
+        if (*format == '$') {
+            format++;
+        }
+        if (*format == '@') {
+            format++;
+        }
+        assert(!IS_END_OF_FORMAT(*format));
+
+        if (i < nargs && i < parser->max) {
+            /* Parameter i is bound positionally. */
+            current_arg = args[i];
+        }
+        else if (nkwargs && i >= pos) {
+            /* Otherwise try to bind it by keyword. */
+            keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
+            if (kwargs != NULL) {
+                current_arg = PyDict_GetItemWithError(kwargs, keyword);
+                if (!current_arg && PyErr_Occurred()) {
+                    return 0;
+                }
+            }
+            else {
+                current_arg = find_keyword(kwnames, kwstack, keyword);
+            }
+            if (current_arg) {
+                /* nkwargs now counts keywords not yet matched to a parameter. */
+                --nkwargs;
+            }
+        }
+        else {
+            current_arg = NULL;
+        }
+
+        if (current_arg) {
+            PyObject **p = va_arg(*p_va, PyObject **);
+            *p = current_arg;
+            format++;
+            continue;
+        }
+
+        if (i < parser->min || i >= parser->required_kwonly_start) {
+            /* Less arguments than required */
+            if (i < pos) {
+                /* Both operands are int, and the value is printed with %d. */
+                int min = Py_MIN(pos, parser->min);
+                PyErr_Format(PyExc_TypeError,
+                             "%.200s%s takes %s %d positional argument%s"
+                             " (%zd given)",
+                             (parser->fname == NULL) ? "function" : parser->fname,
+                             (parser->fname == NULL) ? "" : "()",
+                             min < parser->max ? "at least" : "exactly",
+                             min,
+                             min == 1 ? "" : "s",
+                             nargs);
+            }
+            else {
+                keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
+                if (i >= parser->max) {
+                    PyErr_Format(PyExc_TypeError,  "%.200s%s missing required "
+                                 "keyword-only argument '%U'",
+                                 (parser->fname == NULL) ? "function" : parser->fname,
+                                 (parser->fname == NULL) ? "" : "()",
+                                 keyword);
+                }
+                else {
+                    PyErr_Format(PyExc_TypeError,  "%.200s%s missing required "
+                                 "argument '%U' (pos %d)",
+                                 (parser->fname == NULL) ? "function" : parser->fname,
+                                 (parser->fname == NULL) ? "" : "()",
+                                 keyword, i+1);
+                }
+            }
+            return 0;
+        }
+        /* current code reports success when all required args
+         * fulfilled and no keyword args left, with no further
+         * validation. XXX Maybe skip this in debug build ?
+         */
+        if (!nkwargs && !parser->has_required_kws && !p_args && !p_kwargs) {
+            return 1;
+        }
+
+        /* We are into optional args, skip through to any remaining
+         * keyword args */
+        skipitem_fast(&format, p_va);
+    }
+
+    assert(IS_END_OF_FORMAT(*format) || (*format == '|') || (*format == '$'));
+
+    /* Number of positional arguments that were bound to named parameters;
+     * anything beyond that belongs in *args. */
+    bound_pos_args = Py_MIN(nargs, Py_MIN(parser->max, len));
+    if (p_args) {
+        *p_args = PyTuple_New(nargs - bound_pos_args);
+        if (!*p_args) {
+            return 0;
+        }
+        for (i = bound_pos_args; i < nargs; i++) {
+            PyObject *arg = args[i];
+            /* PyTuple_SET_ITEM takes over the reference added here. */
+            Py_INCREF(arg);
+            PyTuple_SET_ITEM(*p_args, i - bound_pos_args, arg);
+        }
+    }
+
+    if (p_kwargs) {
+        /* This unfortunately needs to be special cased because if len is 0 then we
+         * never go through the main loop. */
+        if (nargs > 0 && len == 0 && !p_args) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s%s takes no positional arguments",
+                         (parser->fname == NULL) ? "function" : parser->fname,
+                         (parser->fname == NULL) ? "" : "()");
+
+            return 0;
+        }
+
+        *p_kwargs = PyDict_New();
+        if (!*p_kwargs) {
+            goto latefail;
+        }
+    }
+
+    if (nkwargs > 0) {
+        Py_ssize_t j;
+        PyObject *value;
+        /* make sure there are no arguments given by name and position */
+        for (i = pos; i < bound_pos_args; i++) {
+            keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
+            if (kwargs != NULL) {
+                current_arg = PyDict_GetItemWithError(kwargs, keyword);
+                if (!current_arg && PyErr_Occurred()) {
+                    goto latefail;
+                }
+            }
+            else {
+                current_arg = find_keyword(kwnames, kwstack, keyword);
+            }
+            if (current_arg) {
+                /* arg present in tuple and in dict */
+                PyErr_Format(PyExc_TypeError,
+                             "argument for %.200s%s given by name ('%U') "
+                             "and position (%d)",
+                             (parser->fname == NULL) ? "function" : parser->fname,
+                             (parser->fname == NULL) ? "" : "()",
+                             keyword, i+1);
+                goto latefail;
+            }
+        }
+        /* make sure there are no extraneous keyword arguments */
+        j = 0;
+        while (1) {
+            int match;
+            /* Fetch the next (keyword, value) pair from whichever keyword
+             * source is in use. */
+            if (kwargs != NULL) {
+                if (!PyDict_Next(kwargs, &j, &keyword, &value))
+                    break;
+            }
+            else {
+                if (j >= PyTuple_GET_SIZE(kwnames))
+                    break;
+                keyword = PyTuple_GET_ITEM(kwnames, j);
+                value = kwstack[j];
+                j++;
+            }
+
+            /* match: 1 = known parameter name, 0 = unknown, -1 = error */
+            match = PySequence_Contains(kwtuple, keyword);
+            if (match <= 0) {
+                if (!match) {
+                    if (!p_kwargs) {
+                        PyErr_Format(PyExc_TypeError,
+                                     "'%S' is an invalid keyword "
+                                     "argument for %.200s%s",
+                                     keyword,
+                                     (parser->fname == NULL) ? "this function" : parser->fname,
+                                     (parser->fname == NULL) ? "" : "()");
+                        goto latefail;
+                    } else {
+                        /* Unknown keywords are collected into **kwargs. */
+                        if (PyDict_SetItem(*p_kwargs, keyword, value) < 0) {
+                            goto latefail;
+                        }
+                    }
+                } else {
+                    goto latefail;
+                }
+            }
+        }
+    }
+
+    return 1;
+    /* Handle failures that have happened after we have tried to
+     * create *args and **kwargs, if they exist. */
+latefail:
+    if (p_args) {
+        Py_XDECREF(*p_args);
+    }
+    if (p_kwargs) {
+        Py_XDECREF(*p_kwargs);
+    }
+    return 0;
+}
+
+/* Step over one format unit without storing a value.
+ *
+ * Advances *p_format by exactly one character. When p_va is non-NULL the
+ * matching PyObject ** slot is also consumed from the va_list and discarded,
+ * so the format string and the argument list stay in step. */
+static void
+skipitem_fast(const char **p_format, va_list *p_va)
+{
+    if (p_va != NULL) {
+        (void) va_arg(*p_va, PyObject **);
+    }
+
+    /* This helper treats every format unit as one character wide. */
+    (*p_format)++;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/init.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/init.c
@@ -0,0 +1,24 @@
+#include <Python.h>
+#include "CPy.h"
+
+struct ExcDummyStruct _CPy_ExcDummyStruct = { PyObject_HEAD_INIT(NULL) };
+PyObject *_CPy_ExcDummy = (PyObject *)&_CPy_ExcDummyStruct;
+
+// System-wide empty tuple constant
+PyObject * __mypyc_empty_tuple__ = NULL;
+
+// The Windows dynamic linker is more restricted than the Linux/OS X ones:
+// globals cannot be initialized with values that live in other dynamic
+// libraries. Those initializations are therefore performed here, at load time.
+void CPy_Init(void) {
+    _CPy_ExcDummyStruct.ob_base.ob_type = &PyBaseObject_Type;
+
+    // The system-wide empty tuple is created only if it does not exist yet
+    if (__mypyc_empty_tuple__ != NULL) {
+        return;
+    }
+    __mypyc_empty_tuple__ = PyTuple_New(0);
+    if (__mypyc_empty_tuple__ == NULL) {
+        CPyError_OutOfMemory();
+    }
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/int_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/int_ops.c
@@ -0,0 +1,647 @@
+// Int primitive operations (tagged arbitrary-precision integers)
+//
+// These are registered in mypyc.primitives.int_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
+#ifndef _WIN32
+// On 64-bit Linux and macOS, ssize_t and long are both 64 bits, and
+// PyLong_FromLong is faster than PyLong_FromSsize_t, so use the faster one
+#define CPyLong_FromSsize_t PyLong_FromLong
+#else
+// On 64-bit Windows, ssize_t is 64 bits but long is 32 bits, so we
+// can't use the above trick
+#define CPyLong_FromSsize_t PyLong_FromSsize_t
+#endif
+
+#if defined(__GNUC__) || defined(__clang__)
+#  if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8)
+#    define CPY_CLZ(x) __builtin_clzll((unsigned long long)(x))
+#    define CPY_BITS 64
+#  else
+#    define CPY_CLZ(x) __builtin_clz((unsigned int)(x))
+#    define CPY_BITS 32
+#  endif
+#endif
+
+
+// Convert a Py_ssize_t into a tagged int.
+CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value) {
+    if (likely(!CPyTagged_TooBig(value))) {
+        // Short representation: the value shifted left by one
+        return value << 1;
+    }
+    // The shifted value would be too large for Py_ssize_t, so box it in a
+    // Python object and mark the pointer with the tag bit
+    PyObject *boxed = PyLong_FromSsize_t(value);
+    return ((CPyTagged)boxed) | CPY_INT_TAG;
+}
+
+// Convert a pointer value into a tagged int.
+CPyTagged CPyTagged_FromVoidPtr(void *ptr) {
+    if ((uintptr_t)ptr <= PY_SSIZE_T_MAX) {
+        // The address fits in Py_ssize_t; reuse the Py_ssize_t conversion
+        return CPyTagged_FromSsize_t((Py_ssize_t)ptr);
+    }
+    // Address exceeds PY_SSIZE_T_MAX: box it as a Python int
+    PyObject *boxed = PyLong_FromVoidPtr(ptr);
+    return ((CPyTagged)boxed) | CPY_INT_TAG;
+}
+
+// Convert an int64_t into a tagged int.
+CPyTagged CPyTagged_FromInt64(int64_t value) {
+    if (likely(!CPyTagged_TooBigInt64(value))) {
+        // Short representation: the value shifted left by one
+        return value << 1;
+    }
+    // Too big for the short representation: box it and set the tag bit
+    PyObject *boxed = PyLong_FromLongLong(value);
+    return ((CPyTagged)boxed) | CPY_INT_TAG;
+}
+
+// Return a Python int object for a tagged int. For a long tagged int the
+// existing object is returned after an INCREF; for a short one a new int
+// object is created.
+PyObject *CPyTagged_AsObject(CPyTagged x) {
+    if (unlikely(CPyTagged_CheckLong(x))) {
+        PyObject *boxed = CPyTagged_LongAsObject(x);
+        Py_INCREF(boxed);
+        return boxed;
+    }
+    PyObject *fresh = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
+    if (fresh == NULL) {
+        CPyError_OutOfMemory();
+    }
+    return fresh;
+}
+
+// Like CPyTagged_AsObject, except that for a long tagged int the existing
+// object is returned without an INCREF.
+PyObject *CPyTagged_StealAsObject(CPyTagged x) {
+    if (unlikely(CPyTagged_CheckLong(x))) {
+        return CPyTagged_LongAsObject(x);
+    }
+    PyObject *fresh = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
+    if (fresh == NULL) {
+        CPyError_OutOfMemory();
+    }
+    return fresh;
+}
+
+// Convert a tagged int to Py_ssize_t. Long tagged ints are converted with
+// PyLong_AsSsize_t and its result is returned as is.
+Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x) {
+    if (unlikely(!CPyTagged_CheckShort(x))) {
+        return PyLong_AsSsize_t(CPyTagged_LongAsObject(x));
+    }
+    return CPyTagged_ShortAsSsize_t(x);
+}
+
+// INCREF the underlying object of a long tagged int; short tagged ints are
+// left alone.
+CPy_NOINLINE
+void CPyTagged_IncRef(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    Py_INCREF(CPyTagged_LongAsObject(x));
+}
+
+// DECREF the underlying object of a long tagged int; short tagged ints are
+// left alone.
+CPy_NOINLINE
+void CPyTagged_DecRef(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    Py_DECREF(CPyTagged_LongAsObject(x));
+}
+
+// XDECREF the underlying object of a long tagged int; short tagged ints are
+// left alone.
+CPy_NOINLINE
+void CPyTagged_XDecRef(CPyTagged x) {
+    if (likely(!CPyTagged_CheckLong(x))) {
+        return;
+    }
+    Py_XDECREF(CPyTagged_LongAsObject(x));
+}
+
+// Slow path for negating a tagged int; the result may be a long integer.
+// Goes through Python objects and PyNumber_Negative.
+CPyTagged CPyTagged_Negate_(CPyTagged num) {
+    PyObject *operand = CPyTagged_AsObject(num);
+    PyObject *negated = PyNumber_Negative(operand);
+    if (!negated) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(operand);
+    return CPyTagged_StealFromObject(negated);
+}
+
+// Slow path for adding two tagged ints; the result may be a long integer.
+// Goes through Python objects and PyNumber_Add.
+CPyTagged CPyTagged_Add_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *sum = PyNumber_Add(lhs, rhs);
+    if (!sum) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    return CPyTagged_StealFromObject(sum);
+}
+
+// Slow path for subtracting tagged ints; the result may be a long integer.
+// Goes through Python objects and PyNumber_Subtract.
+CPyTagged CPyTagged_Subtract_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *difference = PyNumber_Subtract(lhs, rhs);
+    if (!difference) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    return CPyTagged_StealFromObject(difference);
+}
+
+// Slow path for multiplying tagged ints; the result may be a long integer.
+// Goes through Python objects and PyNumber_Multiply.
+CPyTagged CPyTagged_Multiply_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *product = PyNumber_Multiply(lhs, rhs);
+    if (!product) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    return CPyTagged_StealFromObject(product);
+}
+
+// Slow path for tagged int '//'; the result may be a long integer.
+// Returns CPY_INT_TAG when PyNumber_FloorDivide fails.
+CPyTagged CPyTagged_FloorDivide_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *quotient = PyNumber_FloorDivide(lhs, rhs);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (quotient != NULL) {
+        return CPyTagged_StealFromObject(quotient);
+    }
+    // Report the failure to the caller instead of treating it as out of
+    // memory: it could be ZeroDivisionError
+    return CPY_INT_TAG;
+}
+
+// Slow path for tagged int '%'; the result may be a long integer.
+// Returns CPY_INT_TAG when PyNumber_Remainder fails.
+CPyTagged CPyTagged_Remainder_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *remainder = PyNumber_Remainder(lhs, rhs);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (remainder != NULL) {
+        return CPyTagged_StealFromObject(remainder);
+    }
+    // Report the failure to the caller instead of treating it as out of
+    // memory: it could be ZeroDivisionError
+    return CPY_INT_TAG;
+}
+
+// Equality comparison via Python objects.
+// NOTE(review): a short right operand yields false without comparing;
+// presumably this is only reached when at least one operand is long --
+// confirm against callers.
+bool CPyTagged_IsEq_(CPyTagged left, CPyTagged right) {
+    if (CPyTagged_CheckShort(right)) {
+        return false;
+    }
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    int outcome = PyObject_RichCompareBool(lhs, rhs, Py_EQ);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (outcome == -1) {
+        CPyError_OutOfMemory();
+    }
+    return outcome;
+}
+
+// Less-than comparison via Python objects (PyObject_RichCompareBool with Py_LT).
+bool CPyTagged_IsLt_(CPyTagged left, CPyTagged right) {
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    int outcome = PyObject_RichCompareBool(lhs, rhs, Py_LT);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (outcome == -1) {
+        CPyError_OutOfMemory();
+    }
+    return outcome;
+}
+
+// Parse the str object o as an integer in the given base (a tagged int),
+// returning the result of PyLong_FromUnicodeObject.
+PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base) {
+    Py_ssize_t radix = CPyTagged_AsSsize_t(base);
+    return PyLong_FromUnicodeObject(o, radix);
+}
+
+// Parse the str object o as a base-10 integer.
+PyObject *CPyLong_FromStr(PyObject *o) {
+    return CPyLong_FromStrWithBase(o, CPyTagged_FromSsize_t(10));
+}
+
+// Convert a double to a tagged int. Values inside the short tagged range are
+// converted directly; everything else goes through PyLong_FromDouble, and
+// CPY_INT_TAG is returned if that fails.
+CPyTagged CPyTagged_FromFloat(double f) {
+    bool in_short_range =
+        f < ((double)CPY_TAGGED_MAX + 1.0) && f > (CPY_TAGGED_MIN - 1.0);
+    if (!in_short_range) {
+        PyObject *boxed = PyLong_FromDouble(f);
+        if (boxed == NULL) {
+            return CPY_INT_TAG;
+        }
+        return CPyTagged_StealFromObject(boxed);
+    }
+    return (Py_ssize_t)f << 1;
+}
+
+// Return str() of the Python bool object corresponding to b.
+PyObject *CPyBool_Str(bool b) {
+    PyObject *flag = b ? Py_True : Py_False;
+    return PyObject_Str(flag);
+}
+
+// Apply bitwise '&', '|' or '^' (selected by op) through the generic (slow)
+// PyNumber API. Any op other than '&' and '|' is treated as '^'.
+static CPyTagged GenericBitwiseOp(CPyTagged a, CPyTagged b, char op) {
+    PyObject *lhs = CPyTagged_AsObject(a);
+    PyObject *rhs = CPyTagged_AsObject(b);
+    PyObject *combined;
+    switch (op) {
+    case '&':
+        combined = PyNumber_And(lhs, rhs);
+        break;
+    case '|':
+        combined = PyNumber_Or(lhs, rhs);
+        break;
+    default:
+        combined = PyNumber_Xor(lhs, rhs);
+        break;
+    }
+    if (unlikely(combined == NULL)) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    return CPyTagged_StealFromObject(combined);
+}
+
+// Return pointer to digits of a PyLong object. If it's a short
+// integer, place digits in the buffer buf instead to avoid memory
+// allocation (it's assumed to be big enough). Return the number of
+// digits in *size. *size is negative if the integer is negative.
+//
+// For a short integer up to three digits are written, so buf must have room
+// for at least three. For a long integer the returned pointer refers to the
+// object's own digit storage and buf is not used.
+static digit *GetIntDigits(CPyTagged n, Py_ssize_t *size, digit *buf) {
+    if (CPyTagged_CheckShort(n)) {
+        Py_ssize_t val = CPyTagged_ShortAsSsize_t(n);
+        bool neg = val < 0;
+        int len = 1;
+        // Work on the magnitude; the sign is reported through *size
+        if (neg) {
+            val = -val;
+        }
+        // Split the magnitude into PyLong_SHIFT-bit digits, least
+        // significant digit first
+        buf[0] = val & PyLong_MASK;
+        if (val > (Py_ssize_t)PyLong_MASK) {
+            val >>= PyLong_SHIFT;
+            buf[1] = val & PyLong_MASK;
+            if (val > (Py_ssize_t)PyLong_MASK) {
+                buf[2] = val >> PyLong_SHIFT;
+                len = 3;
+            } else {
+                len = 2;
+            }
+        }
+        *size = neg ? -len : len;
+        return buf;
+    } else {
+        PyLongObject *obj = (PyLongObject *)CPyTagged_LongAsObject(n);
+        *size = CPY_LONG_SIZE_SIGNED(obj);
+        return &CPY_LONG_DIGIT(obj, 0);
+    }
+}
+
+// Shared implementation of bitwise '&', '|' and '^' (specified by op) for at least
+// one long operand. This is somewhat optimized for performance.
+//
+// If either operand is negative the work is delegated to GenericBitwiseOp.
+// Otherwise the result is assembled digit by digit with PyLongWriter.
+CPyTagged CPyTagged_BitwiseLongOp_(CPyTagged a, CPyTagged b, char op) {
+    // Directly access the digits, as there is no fast C API function for this.
+    // The stack buffers hold the digits of a short operand (at most three).
+    digit abuf[3];
+    digit bbuf[3];
+    Py_ssize_t asize;
+    Py_ssize_t bsize;
+    digit *adigits = GetIntDigits(a, &asize, abuf);
+    digit *bdigits = GetIntDigits(b, &bsize, bbuf);
+
+    if (unlikely(asize < 0 || bsize < 0)) {
+        // Negative operand. This is slower, but bitwise ops on them are pretty rare.
+        return GenericBitwiseOp(a, b, op);
+    }
+    // Optimized implementation for two non-negative integers.
+    // Swap a and b as needed to ensure a is no longer than b.
+    if (asize > bsize) {
+        digit *tmp = adigits;
+        adigits = bdigits;
+        bdigits = tmp;
+        Py_ssize_t tmp_size = asize;
+        asize = bsize;
+        bsize = tmp_size;
+    }
+    void *digits = NULL;
+    // '&' cannot produce more digits than the shorter operand; '|' and '^'
+    // need as many digits as the longer one.
+    PyLongWriter *writer = PyLongWriter_Create(0, op == '&' ? asize : bsize, &digits);
+    if (unlikely(writer == NULL)) {
+        CPyError_OutOfMemory();
+    }
+    Py_ssize_t i;
+    if (op == '&') {
+        for (i = 0; i < asize; i++) {
+            ((digit *)digits)[i] = adigits[i] & bdigits[i];
+        }
+    } else {
+        if (op == '|') {
+            for (i = 0; i < asize; i++) {
+                ((digit *)digits)[i] = adigits[i] | bdigits[i];
+            }
+        } else {
+            for (i = 0; i < asize; i++) {
+                ((digit *)digits)[i] = adigits[i] ^ bdigits[i];
+            }
+        }
+        // Digits of b beyond the length of a are copied through unchanged
+        for (; i < bsize; i++) {
+            ((digit *)digits)[i] = bdigits[i];
+        }
+    }
+    // NOTE(review): the result of PyLongWriter_Finish is passed on without a
+    // NULL check -- confirm that it cannot fail here.
+    return CPyTagged_StealFromObject(PyLongWriter_Finish(writer));
+}
+
+// Slow path for bitwise '~': goes through Python objects and PyNumber_Invert.
+CPyTagged CPyTagged_Invert_(CPyTagged num) {
+    PyObject *operand = CPyTagged_AsObject(num);
+    PyObject *inverted = PyNumber_Invert(operand);
+    if (unlikely(!inverted)) {
+        CPyError_OutOfMemory();
+    }
+    Py_DECREF(operand);
+    return CPyTagged_StealFromObject(inverted);
+}
+
+// Slow path for bitwise '>>'.
+// Returns CPY_INT_TAG when PyNumber_Rshift fails (its exception is left set).
+CPyTagged CPyTagged_Rshift_(CPyTagged left, CPyTagged right) {
+    // Reached for long integers or negative shift counts: defer to the generic
+    // number protocol on boxed operands.
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *shifted = PyNumber_Rshift(lhs, rhs);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (shifted != NULL) {
+        return CPyTagged_StealFromObject(shifted);
+    }
+    // Propagate the error (could be a negative shift count)
+    return CPY_INT_TAG;
+}
+
+// Slow path for bitwise '<<'.
+// Returns CPY_INT_TAG when PyNumber_Lshift fails (its exception is left set).
+CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right) {
+    // Reached for long integers or out-of-range shifts: defer to the generic
+    // number protocol on boxed operands.
+    PyObject *lhs = CPyTagged_AsObject(left);
+    PyObject *rhs = CPyTagged_AsObject(right);
+    PyObject *shifted = PyNumber_Lshift(lhs, rhs);
+    Py_DECREF(lhs);
+    Py_DECREF(rhs);
+    if (shifted != NULL) {
+        return CPyTagged_StealFromObject(shifted);
+    }
+    // Propagate the error (could be a negative shift count)
+    return CPY_INT_TAG;
+}
+
+// Slow path for unboxing a Python object to i64.
+// On failure an exception is set and CPY_LL_INT_ERROR is returned.
+int64_t CPyLong_AsInt64_(PyObject *o) {
+    int overflow;
+    int64_t value = PyLong_AsLongLongAndOverflow(o, &overflow);
+    if (value != -1) {
+        return value;
+    }
+    // -1 is ambiguous: a real -1, a conversion error, or an overflow.
+    if (PyErr_Occurred()) {
+        return CPY_LL_INT_ERROR;
+    }
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i64");
+        return CPY_LL_INT_ERROR;
+    }
+    return value;
+}
+
+// Floor division of two i64 values (result rounds toward negative infinity).
+// Sets ZeroDivisionError or OverflowError and returns CPY_LL_INT_ERROR on failure.
+int64_t CPyInt64_Divide(int64_t x, int64_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    if (x == INT64_MIN && y == -1) {
+        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
+        return CPY_LL_INT_ERROR;
+    }
+    int64_t quotient = x / y;
+    // C truncates toward zero; step down by one when the operands have opposite
+    // signs and the division was inexact.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && quotient * y != x) {
+        quotient -= 1;
+    }
+    return quotient;
+}
+
+// Remainder of two i64 values where a non-zero result takes the sign of y.
+// Sets ZeroDivisionError and returns CPY_LL_INT_ERROR when y is zero.
+int64_t CPyInt64_Remainder(int64_t x, int64_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    // INT64_MIN % -1 must not be evaluated in C (avoids a core dump); the
+    // answer is 0.
+    if (x == INT64_MIN && y == -1) {
+        return 0;
+    }
+    int64_t rem = x % y;
+    // C's remainder follows the sign of x; shift it to follow the sign of y.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && rem != 0) {
+        rem += y;
+    }
+    return rem;
+}
+
+// Slow path for unboxing a Python object to i32.
+// On failure an exception is set and CPY_LL_INT_ERROR is returned.
+int32_t CPyLong_AsInt32_(PyObject *o) {
+    int overflow;
+    long value = PyLong_AsLongAndOverflow(o, &overflow);
+    // long may be wider than 32 bits, so the i32 range is checked explicitly.
+    if (value < -0x80000000LL || value > 0x7fffffffLL) {
+        overflow = 1;
+        value = -1;
+    }
+    if (value != -1) {
+        return value;
+    }
+    // -1 is ambiguous: a real -1, a conversion error, or an overflow.
+    if (PyErr_Occurred()) {
+        return CPY_LL_INT_ERROR;
+    }
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i32");
+        return CPY_LL_INT_ERROR;
+    }
+    return value;
+}
+
+// Floor division of two i32 values (result rounds toward negative infinity).
+// Sets ZeroDivisionError or OverflowError and returns CPY_LL_INT_ERROR on failure.
+int32_t CPyInt32_Divide(int32_t x, int32_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    if (x == INT32_MIN && y == -1) {
+        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
+        return CPY_LL_INT_ERROR;
+    }
+    int32_t quotient = x / y;
+    // C truncates toward zero; step down by one when the operands have opposite
+    // signs and the division was inexact.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && quotient * y != x) {
+        quotient -= 1;
+    }
+    return quotient;
+}
+
+// Remainder of two i32 values where a non-zero result takes the sign of y.
+// Sets ZeroDivisionError and returns CPY_LL_INT_ERROR when y is zero.
+int32_t CPyInt32_Remainder(int32_t x, int32_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    // INT32_MIN % -1 must not be evaluated in C (avoids a core dump); the
+    // answer is 0.
+    if (x == INT32_MIN && y == -1) {
+        return 0;
+    }
+    int32_t rem = x % y;
+    // C's remainder follows the sign of x; shift it to follow the sign of y.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && rem != 0) {
+        rem += y;
+    }
+    return rem;
+}
+
+// Set OverflowError with the same message the i32 unboxing slow path uses.
+// Only sets the exception; nothing is returned.
+void CPyInt32_Overflow() {
+    PyErr_SetString(PyExc_OverflowError, "int too large to convert to i32");
+}
+
+// Slow path for unboxing a Python object to i16.
+// On failure an exception is set and CPY_LL_INT_ERROR is returned.
+int16_t CPyLong_AsInt16_(PyObject *o) {
+    int overflow;
+    long value = PyLong_AsLongAndOverflow(o, &overflow);
+    // long is wider than 16 bits, so the i16 range is checked explicitly.
+    if (value < -0x8000 || value > 0x7fff) {
+        overflow = 1;
+        value = -1;
+    }
+    if (value != -1) {
+        return value;
+    }
+    // -1 is ambiguous: a real -1, a conversion error, or an overflow.
+    if (PyErr_Occurred()) {
+        return CPY_LL_INT_ERROR;
+    }
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i16");
+        return CPY_LL_INT_ERROR;
+    }
+    return value;
+}
+
+// Floor division of two i16 values (result rounds toward negative infinity).
+// Sets ZeroDivisionError or OverflowError and returns CPY_LL_INT_ERROR on failure.
+int16_t CPyInt16_Divide(int16_t x, int16_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    if (x == INT16_MIN && y == -1) {
+        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
+        return CPY_LL_INT_ERROR;
+    }
+    int16_t quotient = x / y;
+    // C truncates toward zero; step down by one when the operands have opposite
+    // signs and the division was inexact.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && quotient * y != x) {
+        quotient--;
+    }
+    return quotient;
+}
+
+// Remainder of two i16 values where a non-zero result takes the sign of y.
+// Sets ZeroDivisionError and returns CPY_LL_INT_ERROR when y is zero.
+int16_t CPyInt16_Remainder(int16_t x, int16_t y) {
+    if (y == 0) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
+        return CPY_LL_INT_ERROR;
+    }
+    // Same special case as the wider variants (avoids a core dump); the answer is 0.
+    if (x == INT16_MIN && y == -1) {
+        return 0;
+    }
+    int16_t rem = x % y;
+    // C's remainder follows the sign of x; shift it to follow the sign of y.
+    int opposite_signs = (x < 0) != (y < 0);
+    if (opposite_signs && rem != 0) {
+        rem += y;
+    }
+    return rem;
+}
+
+// Set OverflowError with the same message the i16 unboxing slow path uses.
+// Only sets the exception; nothing is returned.
+void CPyInt16_Overflow() {
+    PyErr_SetString(PyExc_OverflowError, "int too large to convert to i16");
+}
+
+// Slow path for unboxing a Python object to u8.
+// On failure an exception is set and CPY_LL_UINT_ERROR is returned.
+uint8_t CPyLong_AsUInt8_(PyObject *o) {
+    int overflow;
+    long value = PyLong_AsLongAndOverflow(o, &overflow);
+    // Anything outside 0..255 is treated as an overflow. This also covers the
+    // -1 returned on a conversion error; that case is told apart below.
+    if (value >= 256 || value < 0) {
+        overflow = 1;
+        value = -1;
+    }
+    if (value != -1) {
+        return value;
+    }
+    if (PyErr_Occurred()) {
+        return CPY_LL_UINT_ERROR;
+    }
+    if (overflow) {
+        PyErr_SetString(PyExc_OverflowError, "int too large or small to convert to u8");
+        return CPY_LL_UINT_ERROR;
+    }
+    return value;
+}
+
+// Set OverflowError with the same message the u8 unboxing slow path uses.
+// Only sets the exception; nothing is returned.
+void CPyUInt8_Overflow() {
+    PyErr_SetString(PyExc_OverflowError, "int too large or small to convert to u8");
+}
+
+// True division ('/') of two tagged integers, producing a C double.
+//
+// Returns CPY_FLOAT_ERROR with an exception set when y is zero or when the
+// generic division fails.
+double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y) {
+    if (unlikely(y == 0)) {
+        PyErr_SetString(PyExc_ZeroDivisionError, "division by zero");
+        return CPY_FLOAT_ERROR;
+    }
+    if (likely(!CPyTagged_CheckLong(x) && !CPyTagged_CheckLong(y))) {
+        // Both operands are short: untag by shifting and divide as doubles.
+        return (double)((Py_ssize_t)x >> 1) / (double)((Py_ssize_t)y >> 1);
+    }
+    // At least one long operand: use the generic number protocol.
+    PyObject *xo = CPyTagged_AsObject(x);
+    PyObject *yo = CPyTagged_AsObject(y);
+    PyObject *result = PyNumber_TrueDivide(xo, yo);
+    // The boxed operands are owned here (the other slow paths in this file
+    // release them the same way), so drop them whether or not the call worked.
+    Py_DECREF(xo);
+    Py_DECREF(yo);
+    if (result == NULL) {
+        return CPY_FLOAT_ERROR;
+    }
+    double value = PyFloat_AsDouble(result);
+    // PyNumber_TrueDivide returned a new reference; release it once read.
+    Py_DECREF(result);
+    return value;
+}
+
+// int.bit_length()
+//
+// Returns, as a tagged int, the number of bits needed to represent the absolute
+// value of self. Returns CPY_INT_TAG when the big-int path reports a failure.
+CPyTagged CPyTagged_BitLength(CPyTagged self) {
+    // Handle zero
+    if (self == 0) {
+        return 0;
+    }
+
+    // Fast path for small (tagged) ints
+    if (CPyTagged_CheckShort(self)) {
+        Py_ssize_t val = CPyTagged_ShortAsSsize_t(self);
+        // The sign does not affect the bit length, so work on the magnitude.
+        Py_ssize_t absval = val < 0 ? -val : val;
+        int bits = 0;
+        if (absval) {
+#if defined(_MSC_VER)
+    #if defined(_WIN64)
+            // idx receives the position of the highest set bit; length is idx + 1.
+            unsigned long idx;
+            if (_BitScanReverse64(&idx, (unsigned __int64)absval)) {
+                bits = (int)(idx + 1);
+            }
+    #else
+            unsigned long idx;
+            if (_BitScanReverse(&idx, (unsigned long)absval)) {
+                bits = (int)(idx + 1);
+            }
+    #endif
+#elif defined(__GNUC__) || defined(__clang__)
+            // CPY_BITS / CPY_CLZ are defined elsewhere; presumably word width and
+            // count-leading-zeros -- confirm against their definitions.
+            bits = (int)(CPY_BITS - CPY_CLZ(absval));
+#else
+            // Fallback to loop if no builtin
+            while (absval) {
+                absval >>= 1;
+                bits++;
+            }
+#endif
+        }
+        // Shift left by one to return the count as a short tagged int.
+        return bits << 1;
+    }
+
+    // Slow path for big ints
+    PyObject *pyint = CPyTagged_AsObject(self);
+    // NOTE(review): the result of _PyLong_NumBits is stored in an int here;
+    // confirm the narrowing is acceptable for the supported Python versions.
+    int bits = _PyLong_NumBits(pyint);
+    Py_DECREF(pyint);
+    if (bits < 0) {
+        // _PyLong_NumBits sets an error on failure
+        return CPY_INT_TAG;
+    }
+    return bits << 1;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_base64.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_base64.c
@@ -0,0 +1,311 @@
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+#include <stdbool.h>
+#include "librt_base64.h"
+#include "libbase64.h"
+#include "pythoncapi_compat.h"
+
+#ifdef MYPYC_EXPERIMENTAL
+
+// Forward declaration: the slow path is defined after b64decode_internal,
+// which calls it.
+static PyObject *
+b64decode_handle_invalid_input(
+    PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen);
+
+// Largest input length (in bytes) that b64encode_internal accepts.
+#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
+
+// Encoded outputs up to this many bytes use a stack buffer in
+// b64encode_internal; larger ones are heap-allocated.
+#define STACK_BUFFER_SIZE 1024
+
+// Encode a bytes object as Base64 and return the result as a new bytes object.
+//
+// obj must be a bytes object; otherwise TypeError is set. Inputs longer than
+// BASE64_MAXBIN set ValueError. Returns NULL with an exception set on failure.
+static PyObject *
+b64encode_internal(PyObject *obj) {
+    if (!PyBytes_Check(obj)) {
+        PyErr_SetString(PyExc_TypeError, "base64() expects a bytes object");
+        return NULL;
+    }
+
+    const char *bin_data = PyBytes_AS_STRING(obj);
+    Py_ssize_t bin_len = PyBytes_GET_SIZE(obj);
+    assert(bin_len >= 0);
+
+    if (bin_len > BASE64_MAXBIN) {
+        PyErr_SetString(PyExc_ValueError, "Too much data for base64 line");
+        return NULL;
+    }
+
+    // Upper bound on the encoded size. Divide before multiplying:
+    // 4 * bin_len can overflow Py_ssize_t for lengths that pass the
+    // BASE64_MAXBIN check. This bound is never smaller than
+    // 4 * bin_len / 3 + 4.
+    Py_ssize_t buflen = 4 * (bin_len / 3) + 8;
+    char stack_buf[STACK_BUFFER_SIZE];
+    char *buf = stack_buf;
+    if (buflen > STACK_BUFFER_SIZE) {
+        buf = PyMem_Malloc(buflen);
+        if (buf == NULL) {
+            return PyErr_NoMemory();
+        }
+    }
+    size_t actual_len;
+    base64_encode(bin_data, bin_len, buf, &actual_len, 0);
+    PyObject *res = PyBytes_FromStringAndSize(buf, actual_len);
+    // Only the heap buffer needs freeing.
+    if (buf != stack_buf) {
+        PyMem_Free(buf);
+    }
+    return res;
+}
+
+// METH_FASTCALL entry point for b64encode(): checks the argument count and
+// forwards the single argument to b64encode_internal.
+static PyObject*
+b64encode(PyObject *self, PyObject *const *args, size_t nargs) {
+    if (nargs == 1) {
+        return b64encode_internal(args[0]);
+    }
+    PyErr_SetString(PyExc_TypeError, "b64encode() takes exactly one argument");
+    return NULL;
+}
+
+// Return 1 if c belongs to the standard Base64 alphabet (A-Z, a-z, 0-9, '+',
+// '/'), or is '=' while allow_padding is true; otherwise return 0.
+static inline int
+is_valid_base64_char(char c, bool allow_padding) {
+    if (c >= 'A' && c <= 'Z') {
+        return 1;
+    }
+    if (c >= 'a' && c <= 'z') {
+        return 1;
+    }
+    if (c >= '0' && c <= '9') {
+        return 1;
+    }
+    if (c == '+' || c == '/') {
+        return 1;
+    }
+    return allow_padding && c == '=';
+}
+
+// Decode Base64 data given as a bytes object or an ASCII-only str.
+//
+// Returns a new bytes object, or NULL with an exception set. Input that the
+// fast decoder rejects is handed to b64decode_handle_invalid_input.
+static PyObject *
+b64decode_internal(PyObject *arg) {
+    const char *input;
+    Py_ssize_t input_len;
+
+    // Locate the raw characters: bytes directly, str only when pure ASCII
+    if (PyBytes_Check(arg)) {
+        input = PyBytes_AS_STRING(arg);
+        input_len = PyBytes_GET_SIZE(arg);
+    } else if (!PyUnicode_Check(arg)) {
+        PyErr_SetString(PyExc_TypeError,
+                        "argument should be a bytes-like object or ASCII string");
+        return NULL;
+    } else if (!PyUnicode_IS_ASCII(arg)) {
+        PyErr_SetString(PyExc_ValueError,
+                        "string argument should contain only ASCII characters");
+        return NULL;
+    } else {
+        input = (const char *)PyUnicode_1BYTE_DATA(arg);
+        input_len = PyUnicode_GET_LENGTH(arg);
+    }
+
+    // Nothing to decode
+    if (input_len == 0) {
+        return PyBytes_FromStringAndSize(NULL, 0);
+    }
+
+    // Drop trailing characters outside the alphabet (padding counts as valid).
+    // Invalid characters elsewhere are tolerated too, but via the slow path.
+    for (; input_len > 0; input_len--) {
+        if (is_valid_base64_char(input[input_len - 1], true)) {
+            break;
+        }
+    }
+
+    // Output capacity of at least 3/4 of the input, computed without overflow:
+    // ceil(3/4 * N) == N - floor(N/4). Never allocate less than one byte.
+    size_t n_in = (size_t)input_len;
+    size_t capacity = n_in - (n_in / 4);
+    if (capacity == 0) {
+        capacity = 1;
+    }
+    if (capacity > (size_t)PY_SSIZE_T_MAX) {
+        PyErr_SetString(PyExc_OverflowError, "input too large");
+        return NULL;
+    }
+
+    // Uninitialized bytes object of the maximum capacity
+    PyObject *decoded = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)capacity);
+    if (decoded == NULL) {
+        return NULL; // Propagate memory error
+    }
+
+    char *dest = PyBytes_AS_STRING(decoded);
+    size_t produced = capacity;
+    int status = base64_decode(input, n_in, dest, &produced, 0);
+
+    if (status == 0) {
+        // Slow path: handle non-base64 input (takes over `decoded`)
+        return b64decode_handle_invalid_input(decoded, dest, capacity, input, n_in);
+    }
+    if (status != 1) {
+        Py_DECREF(decoded);
+        if (status == -1) {
+            PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
+        } else {
+            PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
+        }
+        return NULL;
+    }
+
+    // Sanity-check contract (decoder must not overflow our buffer)
+    if (produced > capacity) {
+        Py_DECREF(decoded);
+        PyErr_SetString(PyExc_RuntimeError, "decoder wrote past output buffer");
+        return NULL;
+    }
+
+    // Shrink in place to the actual decoded length. On failure _PyBytes_Resize
+    // sets an exception and may free the old object.
+    if (_PyBytes_Resize(&decoded, (Py_ssize_t)produced) < 0) {
+        return NULL;
+    }
+    return decoded;
+}
+
+// Process non-base64 input by ignoring non-base64 characters, for compatibility
+// with stdlib b64decode.
+//
+// Arguments:
+//   out_bytes: pre-allocated result object; released here on every failure
+//              path and returned (after resizing) on success
+//   outbuf:    buffer the decoder writes into (the caller passes the storage
+//              of out_bytes)
+//   max_out:   capacity of outbuf in bytes
+//   src:       original input characters
+//   srclen:    number of characters in src
+//
+// Returns the decoded bytes object, or NULL with an exception set.
+static PyObject *
+b64decode_handle_invalid_input(
+    PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen)
+{
+    // Copy input to a temporary buffer, with non-base64 characters and extra suffix
+    // characters removed
+    size_t newbuf_len = 0;
+    char *newbuf = PyMem_Malloc(srclen);
+    if (newbuf == NULL) {
+        Py_DECREF(out_bytes);
+        return PyErr_NoMemory();
+    }
+
+    // Copy base64 characters and some padding to the new buffer
+    for (size_t i = 0; i < srclen; i++) {
+        char c = src[i];
+        if (is_valid_base64_char(c, false)) {
+            newbuf[newbuf_len++] = c;
+        } else if (c == '=') {
+            // Copy a necessary amount of padding
+            int remainder = newbuf_len % 4;
+            if (remainder == 0) {
+                // No padding needed
+                break;
+            }
+            int numpad = 4 - remainder;
+            // Check that there is at least the required amount padding (CPython ignores
+            // extra padding)
+            while (numpad > 0) {
+                if (i == srclen || src[i] != '=') {
+                    break;
+                }
+                newbuf[newbuf_len++] = '=';
+                i++;
+                numpad--;
+                // Skip non-base64 alphabet characters within padding
+                while (i < srclen && !is_valid_base64_char(src[i], true)) {
+                    i++;
+                }
+            }
+            // Everything after the first padding run is ignored.
+            break;
+        }
+    }
+
+    // Stdlib always performs a non-strict padding check
+    if (newbuf_len % 4 != 0) {
+        Py_DECREF(out_bytes);
+        PyMem_Free(newbuf);
+        PyErr_SetString(PyExc_ValueError, "Incorrect padding");
+        return NULL;
+    }
+
+    size_t outlen = max_out;
+    int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
+    PyMem_Free(newbuf);
+
+    if (ret != 1) {
+        Py_DECREF(out_bytes);
+        // The three outcomes are mutually exclusive. Without the else-chain the
+        // ValueError for ret == 0 would be overwritten by the RuntimeError below.
+        if (ret == 0) {
+            PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed");
+        } else if (ret == -1) {
+            PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
+        } else {
+            PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
+        }
+        return NULL;
+    }
+
+    // Shrink in place to the actual decoded length
+    if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) {
+        // _PyBytes_Resize sets an exception and may free the old object
+        return NULL;
+    }
+    return out_bytes;
+}
+
+
+// METH_FASTCALL entry point for b64decode(): checks the argument count and
+// forwards the single argument to b64decode_internal.
+static PyObject*
+b64decode(PyObject *self, PyObject *const *args, size_t nargs) {
+    if (nargs == 1) {
+        return b64decode_internal(args[0]);
+    }
+    PyErr_SetString(PyExc_TypeError, "b64decode() takes exactly one argument");
+    return NULL;
+}
+
+#endif
+
+// Method table for the module. The functions are only registered when
+// MYPYC_EXPERIMENTAL is defined; otherwise the table holds just the sentinel.
+static PyMethodDef librt_base64_module_methods[] = {
+#ifdef MYPYC_EXPERIMENTAL
+    {"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes object using Base64.")},
+    {"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode a Base64 encoded bytes object or ASCII string.")},
+#endif
+    {NULL, NULL, 0, NULL}  // sentinel
+};
+
+// ABI version reported through slot 0 of the exported capsule table.
+static int
+base64_abi_version(void) {
+    return 0;
+}
+
+// API version reported through slot 1 of the exported capsule table.
+static int
+base64_api_version(void) {
+    return 0;
+}
+
+// Py_mod_exec handler. With MYPYC_EXPERIMENTAL defined it publishes the C
+// function table as the module attribute "_C_API"; otherwise it does nothing.
+// Returns 0 on success and -1 on failure.
+static int
+librt_base64_module_exec(PyObject *m)
+{
+#ifdef MYPYC_EXPERIMENTAL
+    // Table exported through the capsule. Slot order is part of the interface:
+    // be careful with the order!
+    static void *base64_api[LIBRT_BASE64_API_LEN] = {
+        (void *)base64_abi_version,
+        (void *)base64_api_version,
+        (void *)b64encode_internal,
+    };
+    PyObject *capsule = PyCapsule_New((void *)base64_api, "librt.base64._C_API", NULL);
+    int status = PyModule_Add(m, "_C_API", capsule);
+    if (status < 0) {
+        return -1;
+    }
+#endif
+    return 0;
+}
+
+// Module slots: run librt_base64_module_exec at exec time and, when the
+// running Python defines Py_MOD_GIL_NOT_USED, declare that value for Py_mod_gil.
+static PyModuleDef_Slot librt_base64_module_slots[] = {
+    {Py_mod_exec, librt_base64_module_exec},
+#ifdef Py_MOD_GIL_NOT_USED
+    {Py_mod_gil, Py_MOD_GIL_NOT_USED},
+#endif
+    {0, NULL}  // sentinel
+};
+
+// Module definition: no per-module state (m_size is 0), with the method
+// table and slots defined above.
+static PyModuleDef librt_base64_module = {
+    .m_base = PyModuleDef_HEAD_INIT,
+    .m_name = "base64",
+    .m_doc = "Fast base64 encoding and decoding optimized for mypyc",
+    .m_size = 0,
+    .m_methods = librt_base64_module_methods,
+    .m_slots = librt_base64_module_slots,
+};
+
+// Module init function: hands the module definition to PyModuleDef_Init so the
+// slots in librt_base64_module_slots drive initialization.
+PyMODINIT_FUNC
+PyInit_base64(void)
+{
+    return PyModuleDef_Init(&librt_base64_module);
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_base64.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_base64.h
@@ -0,0 +1,60 @@
+#ifndef LIBRT_BASE64_H
+#define LIBRT_BASE64_H
+
+#ifndef MYPYC_EXPERIMENTAL
+
+// Stub used when MYPYC_EXPERIMENTAL is not defined: imports nothing and
+// always reports success (returns 0).
+static int
+import_librt_base64(void)
+{
+    // All librt.base64 features are experimental for now, so don't set up the API here
+    return 0;
+}
+
+#else  // MYPYC_EXPERIMENTAL
+
+// Versions this header was compiled against; import_librt_base64() compares
+// them with the values reported by the loaded module.
+#define LIBRT_BASE64_ABI_VERSION 0
+#define LIBRT_BASE64_API_VERSION 0
+// Number of slots in the capsule function table.
+#define LIBRT_BASE64_API_LEN 3
+
+// Local copy of the function table, filled in by import_librt_base64().
+static void *LibRTBase64_API[LIBRT_BASE64_API_LEN];
+
+// Typed accessors for the table slots (slot index == position in the table).
+#define LibRTBase64_ABIVersion (*(int (*)(void)) LibRTBase64_API[0])
+#define LibRTBase64_APIVersion (*(int (*)(void)) LibRTBase64_API[1])
+#define LibRTBase64_b64encode_internal (*(PyObject* (*)(PyObject *source)) LibRTBase64_API[2])
+
+// Import librt.base64, copy its exported function table into LibRTBase64_API
+// and verify version compatibility: the ABI version must match exactly and the
+// module's API version must not be older than the one compiled against.
+// Returns 0 on success, -1 with an exception set on failure.
+static int
+import_librt_base64(void)
+{
+    // The module is imported only for its side effect of making the capsule
+    // lookup below work, so the reference is dropped right away.
+    PyObject *module = PyImport_ImportModule("librt.base64");
+    if (module == NULL) {
+        return -1;
+    }
+    Py_DECREF(module);
+
+    void *exported = PyCapsule_Import("librt.base64._C_API", 0);
+    if (exported == NULL) {
+        return -1;
+    }
+    memcpy(LibRTBase64_API, exported, sizeof(LibRTBase64_API));
+
+    char msg[128];
+    int found_abi = LibRTBase64_ABIVersion();
+    if (found_abi != LIBRT_BASE64_ABI_VERSION) {
+        snprintf(msg, sizeof(msg), "ABI version conflict for librt.base64, expected %d, found %d",
+                 LIBRT_BASE64_ABI_VERSION, found_abi);
+        PyErr_SetString(PyExc_ValueError, msg);
+        return -1;
+    }
+    int found_api = LibRTBase64_APIVersion();
+    if (found_api < LIBRT_BASE64_API_VERSION) {
+        snprintf(msg, sizeof(msg),
+                 "API version conflict for librt.base64, expected %d or newer, found %d (hint: upgrade librt)",
+                 LIBRT_BASE64_API_VERSION, found_api);
+        PyErr_SetString(PyExc_ValueError, msg);
+        return -1;
+    }
+    return 0;
+}
+
+#endif  // MYPYC_EXPERIMENTAL
+
+#endif  // LIBRT_BASE64_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_internal.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_internal.c
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_internal.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/librt_internal.h
@@ -0,0 +1,111 @@
+#ifndef LIBRT_INTERNAL_H
+#define LIBRT_INTERNAL_H
+
+// ABI version -- only an exact match is compatible (import_librt_internal()
+// rejects any other value). This will only be changed in very exceptional cases
+// (likely never) due to strict backward compatibility requirements.
+#define LIBRT_INTERNAL_ABI_VERSION 2
+
+// API version -- more recent versions must maintain backward compatibility, i.e.
+// we can add new features but not remove or change existing features (unless
+// ABI version is changed, but see the comment above). import_librt_internal()
+// accepts a module whose API version is this value or newer.
+ #define LIBRT_INTERNAL_API_VERSION 0
+
+// Number of functions in the capsule API. If you add a new function, also increase
+// LIBRT_INTERNAL_API_VERSION.
+#define LIBRT_INTERNAL_API_LEN 20
+
+#ifdef LIBRT_INTERNAL_MODULE
+
+// Prototypes for the functions implemented by the module itself (this branch
+// is compiled when LIBRT_INTERNAL_MODULE is defined). Each function is declared
+// once; a redundant second declaration of ReadBuffer_internal was dropped.
+static PyObject *ReadBuffer_internal(PyObject *source);
+static PyObject *WriteBuffer_internal(void);
+static PyObject *WriteBuffer_getvalue_internal(PyObject *self);
+static PyObject *ReadBuffer_internal_empty(void);
+static char write_bool_internal(PyObject *data, char value);
+static char read_bool_internal(PyObject *data);
+static char write_str_internal(PyObject *data, PyObject *value);
+static PyObject *read_str_internal(PyObject *data);
+static char write_float_internal(PyObject *data, double value);
+static double read_float_internal(PyObject *data);
+static char write_int_internal(PyObject *data, CPyTagged value);
+static CPyTagged read_int_internal(PyObject *data);
+static char write_tag_internal(PyObject *data, uint8_t value);
+static uint8_t read_tag_internal(PyObject *data);
+static int NativeInternal_ABI_Version(void);
+static char write_bytes_internal(PyObject *data, PyObject *value);
+static PyObject *read_bytes_internal(PyObject *data);
+static uint8_t cache_version_internal(void);
+static PyTypeObject *ReadBuffer_type_internal(void);
+static PyTypeObject *WriteBuffer_type_internal(void);
+static int NativeInternal_API_Version(void);
+
+#else
+
+// Local copy of the capsule function table, filled in by import_librt_internal().
+static void *NativeInternal_API[LIBRT_INTERNAL_API_LEN];
+
+// Typed accessors for the table slots, so client code can call these names as
+// if they were ordinary functions. The slot index is the position in the table.
+// NOTE(review): ReadBuffer_internal_empty has a prototype in the module branch
+// but no slot here -- confirm that is intentional.
+#define ReadBuffer_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[0])
+#define WriteBuffer_internal (*(PyObject* (*)(void)) NativeInternal_API[1])
+#define WriteBuffer_getvalue_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[2])
+#define write_bool_internal (*(char (*)(PyObject *source, char value)) NativeInternal_API[3])
+#define read_bool_internal (*(char (*)(PyObject *source)) NativeInternal_API[4])
+#define write_str_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[5])
+#define read_str_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[6])
+#define write_float_internal (*(char (*)(PyObject *source, double value)) NativeInternal_API[7])
+#define read_float_internal (*(double (*)(PyObject *source)) NativeInternal_API[8])
+#define write_int_internal (*(char (*)(PyObject *source, CPyTagged value)) NativeInternal_API[9])
+#define read_int_internal (*(CPyTagged (*)(PyObject *source)) NativeInternal_API[10])
+#define write_tag_internal (*(char (*)(PyObject *source, uint8_t value)) NativeInternal_API[11])
+#define read_tag_internal (*(uint8_t (*)(PyObject *source)) NativeInternal_API[12])
+#define NativeInternal_ABI_Version (*(int (*)(void)) NativeInternal_API[13])
+#define write_bytes_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[14])
+#define read_bytes_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[15])
+#define cache_version_internal (*(uint8_t (*)(void)) NativeInternal_API[16])
+#define ReadBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[17])
+#define WriteBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[18])
+#define NativeInternal_API_Version (*(int (*)(void)) NativeInternal_API[19])
+
+// Import librt.internal, copy its exported function table into
+// NativeInternal_API and verify version compatibility: the ABI version must
+// match exactly and the module's API version must not be older than the one
+// compiled against. Returns 0 on success, -1 with an exception set on failure.
+static int
+import_librt_internal(void)
+{
+    // The module is imported only for its side effect of making the capsule
+    // lookup below work, so the reference is dropped right away.
+    PyObject *module = PyImport_ImportModule("librt.internal");
+    if (module == NULL) {
+        return -1;
+    }
+    Py_DECREF(module);
+
+    void *exported = PyCapsule_Import("librt.internal._C_API", 0);
+    if (exported == NULL) {
+        return -1;
+    }
+    memcpy(NativeInternal_API, exported, sizeof(NativeInternal_API));
+
+    char msg[128];
+    int found_abi = NativeInternal_ABI_Version();
+    if (found_abi != LIBRT_INTERNAL_ABI_VERSION) {
+        snprintf(msg, sizeof(msg), "ABI version conflict for librt.internal, expected %d, found %d",
+                 LIBRT_INTERNAL_ABI_VERSION, found_abi);
+        PyErr_SetString(PyExc_ValueError, msg);
+        return -1;
+    }
+    int found_api = NativeInternal_API_Version();
+    if (found_api < LIBRT_INTERNAL_API_VERSION) {
+        snprintf(msg, sizeof(msg),
+                 "API version conflict for librt.internal, expected %d or newer, found %d (hint: upgrade librt)",
+                 LIBRT_INTERNAL_API_VERSION, found_api);
+        PyErr_SetString(PyExc_ValueError, msg);
+        return -1;
+    }
+    return 0;
+}
+
+#endif
+
+// True when obj's type is exactly the type returned by ReadBuffer_type_internal().
+static inline bool CPyReadBuffer_Check(PyObject *obj) {
+    PyTypeObject *expected = ReadBuffer_type_internal();
+    return Py_TYPE(obj) == expected;
+}
+
+// True when obj's type is exactly the type returned by WriteBuffer_type_internal().
+static inline bool CPyWriteBuffer_Check(PyObject *obj) {
+    PyTypeObject *expected = WriteBuffer_type_internal();
+    return Py_TYPE(obj) == expected;
+}
+
+#endif  // LIBRT_INTERNAL_H
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/list_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/list_ops.c
@@ -0,0 +1,406 @@
+// List primitive operations
+//
+// These are registered in mypyc.primitives.list_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// Fallback definition for when Python.h does not provide Py_TPFLAGS_SEQUENCE.
+#ifndef Py_TPFLAGS_SEQUENCE
+#define Py_TPFLAGS_SEQUENCE (1 << 5)
+#endif
+
+// Build a new list of `len` items from the variadic PyObject * arguments.
+// Each argument's reference is stolen by the list. Returns NULL if the list
+// cannot be allocated.
+PyObject *CPyList_Build(Py_ssize_t len, ...) {
+    PyObject *list = PyList_New(len);
+    if (list != NULL) {
+        va_list items;
+        va_start(items, len);
+        for (Py_ssize_t pos = 0; pos < len; pos++) {
+            PyObject *item = va_arg(items, PyObject *);
+            // The list takes over the caller's reference
+            PyList_SET_ITEM(list, pos, item);
+        }
+        va_end(items);
+    }
+    return list;
+}
+
+// Remove all items from a list.
+//
+// Exact list instances are cleared directly; other objects (e.g. subclasses)
+// go through their own `clear` method. Returns 1 on success, 0 with an
+// exception set on failure.
+char CPyList_Clear(PyObject *list) {
+    if (PyList_CheckExact(list)) {
+        PyList_Clear(list);
+        return 1;
+    }
+    _Py_IDENTIFIER(clear);
+    PyObject *name = _PyUnicode_FromId(&PyId_clear);
+    if (name == NULL) {
+        return 0;
+    }
+    PyObject *res = PyObject_CallMethodNoArgs(list, name);
+    if (res == NULL) {
+        return 0;
+    }
+    // The call hands back a new reference to its result; only success matters
+    // here, so release it instead of leaking it.
+    Py_DECREF(res);
+    return 1;
+}
+
+// Copy a list. Exact list instances are copied with a full slice; any other
+// object is copied by calling its own `copy` method. Returns NULL on failure.
+PyObject *CPyList_Copy(PyObject *list) {
+    if (!PyList_CheckExact(list)) {
+        _Py_IDENTIFIER(copy);
+        PyObject *method_name = _PyUnicode_FromId(&PyId_copy);
+        if (method_name == NULL) {
+            return NULL;
+        }
+        return PyObject_CallMethodNoArgs(list, method_name);
+    }
+    return PyList_GetSlice(list, 0, PyList_GET_SIZE(list));
+}
+
+// list[index] for an index known to be a short tagged int. Negative indexes
+// count from the end. Returns a new reference, or NULL with IndexError set.
+PyObject *CPyList_GetItemShort(PyObject *list, CPyTagged index) {
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t len = PyList_GET_SIZE(list);
+    if (pos < 0) {
+        pos += len;
+    }
+    if (pos < 0 || pos >= len) {
+        PyErr_SetString(PyExc_IndexError, "list index out of range");
+        return NULL;
+    }
+    PyObject *item = PyList_GET_ITEM(list, pos);
+    Py_INCREF(item);
+    return item;
+}
+
+// Like CPyList_GetItemShort, but the returned reference is borrowed (the
+// reference count is not incremented). Returns NULL with IndexError set when
+// the index is out of range.
+PyObject *CPyList_GetItemShortBorrow(PyObject *list, CPyTagged index) {
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t len = PyList_GET_SIZE(list);
+    if (pos < 0) {
+        pos += len;
+    }
+    if (pos < 0 || pos >= len) {
+        PyErr_SetString(PyExc_IndexError, "list index out of range");
+        return NULL;
+    }
+    return PyList_GET_ITEM(list, pos);
+}
+
+// list[index] for an arbitrary tagged int. Negative indexes count from the end.
+// Returns a new reference. Returns NULL with OverflowError set when the index
+// is not a short int, or with IndexError set when it is out of range.
+PyObject *CPyList_GetItem(PyObject *list, CPyTagged index) {
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return NULL;
+    }
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t len = PyList_GET_SIZE(list);
+    if (pos < 0) {
+        pos += len;
+    }
+    if (pos < 0 || pos >= len) {
+        PyErr_SetString(PyExc_IndexError, "list index out of range");
+        return NULL;
+    }
+    PyObject *item = PyList_GET_ITEM(list, pos);
+    Py_INCREF(item);
+    return item;
+}
+
+// Like CPyList_GetItem, but the returned reference is borrowed (the reference
+// count is not incremented). Returns NULL with OverflowError set when the index
+// is not a short int, or with IndexError set when it is out of range.
+PyObject *CPyList_GetItemBorrow(PyObject *list, CPyTagged index) {
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return NULL;
+    }
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t len = PyList_GET_SIZE(list);
+    if (pos < 0) {
+        pos += len;
+    }
+    if (pos < 0 || pos >= len) {
+        PyErr_SetString(PyExc_IndexError, "list index out of range");
+        return NULL;
+    }
+    return PyList_GET_ITEM(list, pos);
+}
+
+// list[index] for an i64 index. Negative indexes count from the end.
+// Returns a new reference, or NULL with IndexError set.
+PyObject *CPyList_GetItemInt64(PyObject *list, int64_t index) {
+    size_t len = PyList_GET_SIZE(list);
+    // One unsigned comparison accepts exactly the in-range non-negative indexes
+    // (a negative index becomes a huge unsigned value).
+    if (unlikely((uint64_t)index >= len)) {
+        if (index >= 0) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+        index += len;
+        if (index < 0) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+    }
+    PyObject *item = PyList_GET_ITEM(list, index);
+    Py_INCREF(item);
+    return item;
+}
+
+// Like CPyList_GetItemInt64, but the returned reference is borrowed (the
+// reference count is not incremented). Returns NULL with IndexError set when
+// the index is out of range.
+PyObject *CPyList_GetItemInt64Borrow(PyObject *list, int64_t index) {
+    size_t len = PyList_GET_SIZE(list);
+    // One unsigned comparison accepts exactly the in-range non-negative indexes
+    // (a negative index becomes a huge unsigned value).
+    if (unlikely((uint64_t)index >= len)) {
+        if (index >= 0) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+        index += len;
+        if (index < 0) {
+            PyErr_SetString(PyExc_IndexError, "list index out of range");
+            return NULL;
+        }
+    }
+    return PyList_GET_ITEM(list, index);
+}
+
+// list[index] = value for a tagged int index. Negative indexes count from the
+// end. On success the reference to value is stolen and true is returned.
+// Returns false with OverflowError set when the index is not a short int, or
+// with IndexError set when it is out of range.
+bool CPyList_SetItem(PyObject *list, CPyTagged index, PyObject *value) {
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return false;
+    }
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    Py_ssize_t len = PyList_GET_SIZE(list);
+    if (pos < 0) {
+        pos += len;
+    }
+    if (pos < 0 || pos >= len) {
+        PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
+        return false;
+    }
+    // PyList_SET_ITEM doesn't decref the old element, so release it here first
+    PyObject *previous = PyList_GET_ITEM(list, pos);
+    Py_DECREF(previous);
+    // N.B: Steals reference
+    PyList_SET_ITEM(list, pos, value);
+    return true;
+}
+
+// Store value at list[index] for a native 64-bit index.
+//
+// Negative indexes count from the end of the list. On success the reference
+// to value is stolen, the previous element is released, and true is returned.
+// If the index is out of range an IndexError is set and false is returned.
+bool CPyList_SetItemInt64(PyObject *list, int64_t index, PyObject *value) {
+    size_t size = PyList_GET_SIZE(list);
+    if (unlikely((uint64_t)index >= size)) {
+        // A non-negative index that got here is past the end. This must
+        // include index 0 on an empty list; otherwise slot 0 of an empty
+        // list would be written below.
+        if (index >= 0) {
+            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
+            return false;
+        }
+        index += size;
+        if (index < 0) {
+            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
+            return false;
+        }
+    }
+    // PyList_SET_ITEM doesn't decref the old element, so we do
+    Py_DECREF(PyList_GET_ITEM(list, index));
+    // N.B: Steals reference
+    PyList_SET_ITEM(list, index, value);
+    return true;
+}
+
+// This function should only be used to fill in brand new lists.
+//
+// It performs no bounds check and does not release whatever was previously
+// stored in the slot; value is stored directly via PyList_SET_ITEM.
+void CPyList_SetItemUnsafe(PyObject *list, Py_ssize_t index, PyObject *value) {
+    PyList_SET_ITEM(list, index, value);
+}
+
+#ifdef Py_GIL_DISABLED
+// The original optimized list.pop implementation doesn't work on free-threaded
+// builds, so provide an alternative that is a bit slower but works.
+//
+// Fetches a new reference to list[index] and then deletes that position.
+// Returns the item, or NULL with an exception set if either step fails.
+//
+// Note that this implementation isn't intended to be atomic.
+static inline PyObject *list_pop_index(PyObject *list, Py_ssize_t index) {
+    PyObject *popped = PyList_GetItemRef(list, index);
+    if (popped != NULL && PySequence_DelItem(list, index) < 0) {
+        // Deletion failed: drop the reference we took and report the error.
+        Py_DECREF(popped);
+        popped = NULL;
+    }
+    return popped;
+}
+#endif
+
+// Remove and return the last item of list.
+// Returns NULL (with an exception set by the helper) on failure.
+PyObject *CPyList_PopLast(PyObject *list)
+{
+#ifdef Py_GIL_DISABLED
+    // The other implementation causes segfaults on a free-threaded Python 3.14b4 build.
+    Py_ssize_t index = PyList_GET_SIZE(list) - 1;
+    return list_pop_index(list, index);
+#else
+    // I tried a specialized version of pop_impl for just removing the
+    // last element and it wasn't any faster in microbenchmarks than
+    // the generic one so I ditched it.
+    return list_pop_impl((PyListObject *)list, -1);
+#endif
+}
+
+// Remove and return the item at a tagged-int index.
+// A non-short (boxed) index raises OverflowError; other failures are
+// reported by the pop helper. Returns NULL on error.
+PyObject *CPyList_Pop(PyObject *obj, CPyTagged index)
+{
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return NULL;
+    }
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+#ifdef Py_GIL_DISABLED
+    // We must use a slower implementation on free-threaded builds.
+    // It does not normalize negative indexes itself, so do that here.
+    if (pos < 0) {
+        pos += PyList_GET_SIZE(obj);
+    }
+    return list_pop_index(obj, pos);
+#else
+    return list_pop_impl((PyListObject *)obj, pos);
+#endif
+}
+
+// Count the items of the list obj that compare equal to value.
+// Thin wrapper around list_count(); obj is cast to PyListObject * without a
+// type check, so the caller must pass a list.
+CPyTagged CPyList_Count(PyObject *obj, PyObject *value)
+{
+    return list_count((PyListObject *)obj, value);
+}
+
+// Insert value before position index (a tagged int).
+// Returns the result of PyList_Insert, or -1 with OverflowError set when the
+// index is not a short tagged int.
+int CPyList_Insert(PyObject *list, CPyTagged index, PyObject *value)
+{
+    if (!CPyTagged_CheckShort(index)) {
+        // The max range doesn't exactly coincide with ssize_t, but we still
+        // want to keep the error message compatible with CPython.
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return -1;
+    }
+    return PyList_Insert(list, CPyTagged_ShortAsSsize_t(index), value);
+}
+
+// Extend list o1 with the items of o2.
+// Returns a new reference to None on success, or NULL if PyList_Extend
+// reports failure.
+PyObject *CPyList_Extend(PyObject *o1, PyObject *o2) {
+    if (PyList_Extend(o1, o2) < 0) {
+        return NULL;
+    }
+    Py_RETURN_NONE;
+}
+
+// Return the index of the first item that compares equal to obj, -1 if no
+// item matches, or -2 if a comparison raised an exception.
+static Py_ssize_t _CPyList_Find(PyObject *list, PyObject *obj) {
+    // The size is re-read on every iteration because a comparison may run
+    // arbitrary code that changes the list.
+    for (Py_ssize_t pos = 0; pos < Py_SIZE(list); pos++) {
+        PyObject *candidate = PyList_GET_ITEM(list, pos);
+        // Keep the item alive for the duration of the comparison.
+        Py_INCREF(candidate);
+        int outcome = PyObject_RichCompareBool(candidate, obj, Py_EQ);
+        Py_DECREF(candidate);
+        if (outcome < 0) {
+            return -2;
+        }
+        if (outcome > 0) {
+            return pos;
+        }
+    }
+    return -1;
+}
+
+// Remove the first item equal to obj from list.
+// Returns the result of PyList_SetSlice when a match is found; otherwise
+// returns -1, setting ValueError when no item matched (a failed comparison
+// leaves its own exception in place).
+int CPyList_Remove(PyObject *list, PyObject *obj) {
+    Py_ssize_t pos = _CPyList_Find(list, obj);
+    if (pos >= 0) {
+        // Deleting the one-element slice removes the item.
+        return PyList_SetSlice(list, pos, pos + 1, NULL);
+    }
+    if (pos == -1) {
+        PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
+    }
+    return -1;
+}
+
+// Return the index of the first item equal to obj as a short tagged int.
+// Returns CPY_INT_TAG on failure: ValueError is set when no item matched,
+// while a failed comparison leaves its own exception in place.
+CPyTagged CPyList_Index(PyObject *list, PyObject *obj) {
+    Py_ssize_t pos = _CPyList_Find(list, obj);
+    if (pos >= 0) {
+        // Shift left by one to form the unboxed tagged representation.
+        return pos << 1;
+    }
+    if (pos == -1) {
+        PyErr_SetString(PyExc_ValueError, "value is not in list");
+    }
+    return CPY_INT_TAG;
+}
+
+// Build a new list from seq and sort it in place.
+// Returns the new list, or NULL if the copy or the sort fails.
+PyObject *CPySequence_Sort(PyObject *seq) {
+    PyObject *copy = PySequence_List(seq);
+    if (copy != NULL && PyList_Sort(copy) < 0) {
+        // Sorting failed: discard the partially processed copy.
+        Py_DECREF(copy);
+        copy = NULL;
+    }
+    return copy;
+}
+
+// Repeat seq t_size times (seq * n) via PySequence_Repeat.
+// Returns NULL if the tagged count cannot be converted to Py_ssize_t.
+PyObject *CPySequence_Multiply(PyObject *seq, CPyTagged t_size) {
+    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
+    // -1 is also a legitimate count, so it only signals an error when an
+    // exception is pending.
+    bool failed = (count == -1) && PyErr_Occurred();
+    if (failed) {
+        return NULL;
+    }
+    return PySequence_Repeat(seq, count);
+}
+
+// Reflected form of CPySequence_Multiply (n * seq): same operation with the
+// arguments in the opposite order.
+PyObject *CPySequence_RMultiply(CPyTagged t_size, PyObject *seq) {
+    return CPySequence_Multiply(seq, t_size);
+}
+
+// In-place repeat (seq *= n) via PySequence_InPlaceRepeat.
+// Returns NULL if the tagged count cannot be converted to Py_ssize_t.
+PyObject *CPySequence_InPlaceMultiply(PyObject *seq, CPyTagged t_size) {
+    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
+    // -1 is also a legitimate count, so it only signals an error when an
+    // exception is pending.
+    bool failed = (count == -1) && PyErr_Occurred();
+    if (failed) {
+        return NULL;
+    }
+    return PySequence_InPlaceRepeat(seq, count);
+}
+
+// Return obj[start:end].
+// Exact lists with short tagged bounds take a fast path through
+// PyList_GetSlice; everything else is delegated to CPyObject_GetSlice.
+PyObject *CPyList_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
+    if (unlikely(!PyList_CheckExact(obj)
+                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
+        return CPyObject_GetSlice(obj, start, end);
+    }
+    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
+    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
+    // Negative bounds are relative to the end of the list.
+    if (lo < 0) {
+        lo += PyList_GET_SIZE(obj);
+    }
+    if (hi < 0) {
+        hi += PyList_GET_SIZE(obj);
+    }
+    return PyList_GetSlice(obj, lo, hi);
+}
+
+// Test whether the type of obj has Py_TPFLAGS_SEQUENCE set.
+// The result is the masked flag value, so it is nonzero (not necessarily 1)
+// when the flag is present and 0 otherwise.
+int CPySequence_Check(PyObject *obj) {
+    return Py_TYPE(obj)->tp_flags & Py_TPFLAGS_SEQUENCE;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/misc_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/misc_ops.c
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/module_shim.tmpl
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/module_shim.tmpl
@@ -0,0 +1,21 @@
+#include <Python.h>
+
+// Module init shim: imports the shared library module, fetches the capsule
+// holding the real init function for this module, and calls it.
+// Returns NULL (with the exception from the failing step) on any error.
+PyMODINIT_FUNC
+PyInit_{modname}(void)
+{{
+    PyObject *tmp;
+    if (!(tmp = PyImport_ImportModule("{libname}"))) return NULL;
+    PyObject *capsule = PyObject_GetAttrString(tmp, "init_{full_modname}");
+    // The library module reference is no longer needed once the attribute is fetched.
+    Py_DECREF(tmp);
+    if (capsule == NULL) return NULL;
+    void *init_func = PyCapsule_GetPointer(capsule, "{libname}.init_{full_modname}");
+    Py_DECREF(capsule);
+    if (!init_func) {{
+        return NULL;
+    }}
+    // The capsule stores the init function as a plain pointer; cast it back before calling.
+    return ((PyObject *(*)(void))init_func)();
+}}
+
+// distutils sometimes spuriously tells cl to export CPyInit___init__,
+// so provide that so it chills out.
+// It simply forwards to the real init function of this module.
+PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/module_shim_no_gil_multiphase.tmpl
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/module_shim_no_gil_multiphase.tmpl
@@ -0,0 +1,41 @@
+#include <Python.h>
+
+// Exec-slot shim: imports the shared library module, fetches the capsule
+// holding the real exec function for this module, and runs it on module.
+// Returns 0 on success and -1 on any failure.
+static int {modname}_exec(PyObject *module)
+{{
+    PyObject *tmp;
+    if (!(tmp = PyImport_ImportModule("{libname}"))) return -1;
+    PyObject *capsule = PyObject_GetAttrString(tmp, "exec_{full_modname}");
+    // The library module reference is no longer needed once the attribute is fetched.
+    Py_DECREF(tmp);
+    if (capsule == NULL) return -1;
+    void *exec_func = PyCapsule_GetPointer(capsule, "{libname}.exec_{full_modname}");
+    Py_DECREF(capsule);
+    if (!exec_func) return -1;
+    // Any nonzero result from the real exec function is reported as -1.
+    if (((int (*)(PyObject *))exec_func)(module) != 0) return -1;
+    return 0;
+}}
+
+// Slot table for multi-phase initialization: registers the exec shim above,
+// declares that multiple interpreters are not supported, and declares that
+// the module does not use the GIL. The zero entry terminates the table.
+static PyModuleDef_Slot {modname}_slots[] = {{
+    {{Py_mod_exec, {modname}_exec}},
+    {{Py_mod_multiple_interpreters, Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED}},
+    {{Py_mod_gil, Py_MOD_GIL_NOT_USED}},
+    {{0, NULL}},
+}};
+
+// Module definition: no docstring, no methods and no per-module state;
+// all real setup happens through the slots.
+static struct PyModuleDef {modname}_module = {{
+    PyModuleDef_HEAD_INIT,
+    .m_name = "{modname}",
+    .m_doc = NULL,
+    .m_methods = NULL,
+    .m_size = 0,
+    .m_slots = {modname}_slots,
+}};
+
+// Module init entry point: hands the module definition to PyModuleDef_Init
+// and returns its result.
+PyMODINIT_FUNC
+PyInit_{modname}(void)
+{{
+    return PyModuleDef_Init(&{modname}_module);
+}}
+
+// distutils sometimes spuriously tells cl to export CPyInit___init__,
+// so provide that so it chills out.
+// It simply forwards to the real init function of this module.
+PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/mypyc_util.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/mypyc_util.h
@@ -0,0 +1,182 @@
+#ifndef MYPYC_UTIL_H
+#define MYPYC_UTIL_H
+
+#include <Python.h>
+#include <frameobject.h>
+#include <assert.h>
+
+// Branch-prediction hints and an "unreachable" marker. GCC and Clang get the
+// compiler builtins; other compilers get no-op hints and abort().
+#if defined(__clang__) || defined(__GNUC__)
+#define likely(x)       __builtin_expect((x),1)
+#define unlikely(x)     __builtin_expect((x),0)
+#define CPy_Unreachable() __builtin_unreachable()
+#else
+#define likely(x)       (x)
+#define unlikely(x)     (x)
+#define CPy_Unreachable() abort()
+#endif
+
+// Attribute that asks the compiler not to inline a function; expands to
+// nothing on compilers that are neither GCC/Clang nor MSVC.
+#if defined(__clang__) || defined(__GNUC__)
+#define CPy_NOINLINE __attribute__((noinline))
+#elif defined(_MSC_VER)
+#define CPy_NOINLINE __declspec(noinline)
+#else
+#define CPy_NOINLINE
+#endif
+
+// CPyThreadLocal: storage-class specifier for per-thread variables. It is
+// empty unless Py_GIL_DISABLED is defined; otherwise it is picked from
+// whatever thread-local keyword the compiler offers.
+#ifndef Py_GIL_DISABLED
+
+// Everything is running in the same thread, so no need for thread locals
+#define CPyThreadLocal
+
+#else
+
+// 1. Use C11 standard thread_local storage, if available
+#if defined(__STDC_VERSION__)  && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
+#define CPyThreadLocal _Thread_local
+
+// 2. Microsoft Visual Studio fallback
+#elif defined(_MSC_VER)
+#define CPyThreadLocal __declspec(thread)
+
+// 3. GNU thread local storage for GCC/Clang targets that still need it
+#elif defined(__GNUC__) || defined(__clang__)
+#define CPyThreadLocal __thread
+
+#else
+#error "Can't define CPyThreadLocal for this compiler/target (consider using a non-free-threaded Python build)"
+#endif
+
+#endif // Py_GIL_DISABLED
+
+// INCREF and DECREF that assert the pointer is not NULL.
+// asserts are disabled in release builds so there shouldn't be a perf hit.
+// I'm honestly kind of surprised that this isn't done by default.
+// Wrapped in do/while (0) so each macro behaves as a single statement.
+#define CPy_INCREF(p) do { assert(p); Py_INCREF(p); } while (0)
+#define CPy_DECREF(p) do { assert(p); Py_DECREF(p); } while (0)
+// Here just for consistency (Py_XDECREF already accepts NULL)
+#define CPy_XDECREF(p) Py_XDECREF(p)
+
+#ifndef Py_GIL_DISABLED
+
+// The *_NO_IMM operations below perform refcount manipulation for
+// non-immortal objects (Python 3.12 and later).
+//
+// Py_INCREF and other CPython operations check for immortality. This
+// can be expensive when we know that an object cannot be immortal.
+//
+// This optimization cannot be performed in free-threaded mode so we
+// fall back to just calling the normal incref/decref operations.
+
+// Increment the reference count directly, without the immortality check that
+// Py_INCREF performs. Only valid for objects known not to be immortal.
+static inline void CPy_INCREF_NO_IMM(PyObject *op)
+{
+    op->ob_refcnt++;
+}
+
+// Decrement the reference count directly (no immortality check) and
+// deallocate the object once the count reaches zero. Only valid for objects
+// known not to be immortal.
+static inline void CPy_DECREF_NO_IMM(PyObject *op)
+{
+    op->ob_refcnt -= 1;
+    if (op->ob_refcnt != 0) {
+        return;
+    }
+    _Py_Dealloc(op);
+}
+
+// NULL-tolerant variant of CPy_DECREF_NO_IMM: does nothing for NULL,
+// otherwise decrements the count directly and deallocates at zero.
+static inline void CPy_XDECREF_NO_IMM(PyObject *op)
+{
+    if (op == NULL) {
+        return;
+    }
+    op->ob_refcnt -= 1;
+    if (op->ob_refcnt == 0) {
+        _Py_Dealloc(op);
+    }
+}
+
+// Macro wrappers with the same names as the functions above: they cast the
+// argument to PyObject * so callers can pass any object pointer type.
+#define CPy_INCREF_NO_IMM(op) CPy_INCREF_NO_IMM((PyObject *)(op))
+#define CPy_DECREF_NO_IMM(op) CPy_DECREF_NO_IMM((PyObject *)(op))
+#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF_NO_IMM((PyObject *)(op))
+
+#else
+
+// Free-threaded build: fall back to the regular refcount operations.
+#define CPy_INCREF_NO_IMM(op) CPy_INCREF(op)
+#define CPy_DECREF_NO_IMM(op) CPy_DECREF(op)
+#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF(op)
+
+#endif
+
+// Tagged integer -- our representation of Python 'int' objects.
+// Small enough integers are represented as unboxed integers (shifted
+// left by 1); larger integers (larger than 63 bits on a 64-bit
+// platform) are stored as a tagged pointer (PyObject *)
+// representing a Python int object, with the lowest bit set.
+// Tagged integers are always normalized. A small integer *must not*
+// have the tag bit set.
+typedef size_t CPyTagged;
+
+// NOTE(review): presumably an integer type used to carry pointer values -- confirm against users.
+typedef size_t CPyPtr;
+
+// Width of a tagged integer in bits.
+#define CPY_INT_BITS (CHAR_BIT * sizeof(CPyTagged))
+
+// Largest and smallest values representable as a short (unboxed) tagged int:
+// one bit is taken by the tag and one by the sign.
+#define CPY_TAGGED_MAX (((Py_ssize_t)1 << (CPY_INT_BITS - 2)) - 1)
+#define CPY_TAGGED_MIN (-((Py_ssize_t)1 << (CPY_INT_BITS - 2)))
+// Magnitude of CPY_TAGGED_MIN as an unsigned value.
+#define CPY_TAGGED_ABS_MIN (0-(size_t)CPY_TAGGED_MIN)
+
+typedef PyObject CPyModule;
+
+// Tag bit used for long integers
+#define CPY_INT_TAG 1
+
+// Error value for signed fixed-width (low-level) integers
+#define CPY_LL_INT_ERROR -113
+
+// Error value for unsigned fixed-width (low-level) integers
+#define CPY_LL_UINT_ERROR 239
+
+// Error value for floats
+#define CPY_FLOAT_ERROR -113.0
+
+// Generic function-pointer type used for vtable entries.
+typedef void (*CPyVTableItem)(void);
+
+// Convert a C int to a short (unboxed) tagged integer.
+//
+// The value is widened to CPyTagged before shifting: left-shifting a
+// negative signed value is undefined behavior, and shifting in int width
+// could overflow before the widening happened. The unsigned shift is well
+// defined and yields the two's-complement encoding of 2 * x.
+static inline CPyTagged CPyTagged_ShortFromInt(int x) {
+    return (CPyTagged)x << 1;
+}
+
+// Convert a Py_ssize_t to a short (unboxed) tagged integer.
+//
+// The shift is done on the unsigned CPyTagged type because left-shifting a
+// negative signed value is undefined behavior; the unsigned shift yields the
+// two's-complement encoding of 2 * x. The caller must ensure x is within
+// [CPY_TAGGED_MIN, CPY_TAGGED_MAX].
+static inline CPyTagged CPyTagged_ShortFromSsize_t(Py_ssize_t x) {
+    return (CPyTagged)x << 1;
+}
+
+// Are we targeting Python 3.X or newer? Each macro expands to a parenthesized
+// comparison against PY_VERSION_HEX, so use it with #if, not #ifdef.
+#define CPY_3_11_FEATURES (PY_VERSION_HEX >= 0x030b0000)
+#define CPY_3_12_FEATURES (PY_VERSION_HEX >= 0x030c0000)
+#define CPY_3_14_FEATURES (PY_VERSION_HEX >= 0x030e0000)
+
+#if CPY_3_12_FEATURES
+
+// Same as macros in CPython internal/pycore_long.h, but with a CPY_ prefix
+// The low CPY_NON_SIZE_BITS bits of lv_tag are not part of the digit count;
+// the bits selected by CPY_SIGN_MASK hold the sign.
+#define CPY_NON_SIZE_BITS 3
+#define CPY_SIGN_ZERO 1
+#define CPY_SIGN_NEGATIVE 2
+#define CPY_SIGN_MASK 3
+
+// The n-th digit of the int object o
+#define CPY_LONG_DIGIT(o, n) ((o)->long_value.ob_digit[n])
+
+// Only available on Python 3.12 and later
+#define CPY_LONG_TAG(o) ((o)->long_value.lv_tag)
+#define CPY_LONG_IS_NEGATIVE(o) (((o)->long_value.lv_tag & CPY_SIGN_MASK) == CPY_SIGN_NEGATIVE)
+// Only available on Python 3.12 and later
+#define CPY_LONG_SIZE(o) ((o)->long_value.lv_tag >> CPY_NON_SIZE_BITS)
+// Number of digits; negative for negative ints
+#define CPY_LONG_SIZE_SIGNED(o) (CPY_LONG_IS_NEGATIVE(o) ? -CPY_LONG_SIZE(o) : CPY_LONG_SIZE(o))
+// Number of digits, assuming int is non-negative
+#define CPY_LONG_SIZE_UNSIGNED(o) CPY_LONG_SIZE(o)
+
+#else
+
+// Pre-3.12 layout: the digits live directly in the object and the sign is
+// carried by the sign of ob_size.
+// The n-th digit of the int object o
+#define CPY_LONG_DIGIT(o, n) ((o)->ob_digit[n])
+// True if the int object o is negative
+#define CPY_LONG_IS_NEGATIVE(o) (((o)->ob_base.ob_size) < 0)
+// Number of digits; negative for negative ints
+#define CPY_LONG_SIZE_SIGNED(o) ((o)->ob_base.ob_size)
+// Number of digits, assuming int is non-negative
+#define CPY_LONG_SIZE_UNSIGNED(o) ((o)->ob_base.ob_size)
+
+#endif
+
+// Are we targeting Python 3.13 or newer?
+#define CPY_3_13_FEATURES (PY_VERSION_HEX >= 0x030d0000)
+
+// Are we targeting Python 3.14 or newer?
+// (Repeats the identical definition that appears earlier in this header.)
+#define CPY_3_14_FEATURES (PY_VERSION_HEX >= 0x030e0000)
+
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythoncapi_compat.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythoncapi_compat.h
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythonsupport.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythonsupport.c
@@ -0,0 +1,106 @@
+// Collects code that was copied in from cpython, for a couple of different reasons:
+//  * We wanted to modify it to produce a more efficient version for our uses
+//  * We needed to call it and it was static :(
+//  * We wanted to call it and needed to backport it
+
+#include "pythonsupport.h"
+
+#if CPY_3_12_FEATURES
+
+// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined)
+//
+// Converts the int object vv to a Py_ssize_t that fits the tagged-int range
+// [CPY_TAGGED_MIN, CPY_TAGGED_MAX]. On success *overflow is 0. If the value
+// does not fit, *overflow is set to +1 or -1 (the sign of the value) and -1
+// is returned. vv is cast to PyLongObject * without a type check.
+Py_ssize_t
+CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
+{
+    PyLongObject *v = (PyLongObject *)vv;
+    size_t x, prev;
+    Py_ssize_t res;
+    Py_ssize_t i;
+    int sign;
+
+    *overflow = 0;
+
+    res = -1;
+    i = CPY_LONG_TAG(v);
+
+    sign = 1;
+    x = 0;
+    if (i & CPY_SIGN_NEGATIVE) {
+        sign = -1;
+    }
+    // Drop the non-size bits of the tag to get the digit count.
+    i >>= CPY_NON_SIZE_BITS;
+    // Accumulate the magnitude from the most significant digit down.
+    while (--i >= 0) {
+        prev = x;
+        x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
+        // If shifting back does not recover the previous value, bits were lost.
+        if ((x >> PyLong_SHIFT) != prev) {
+            *overflow = sign;
+            goto exit;
+        }
+    }
+    /* Haven't lost any bits, but casting to long requires extra
+     * care.
+     */
+    if (x <= (size_t)CPY_TAGGED_MAX) {
+        res = (Py_ssize_t)x * sign;
+    }
+    else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
+        res = CPY_TAGGED_MIN;
+    }
+    else {
+        *overflow = sign;
+        /* res is already set to -1 */
+    }
+  exit:
+    return res;
+}
+
+#else
+
+// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined, Python 3.11 and earlier)
+//
+// Converts the int object vv to a Py_ssize_t that fits the tagged-int range
+// [CPY_TAGGED_MIN, CPY_TAGGED_MAX]. On success *overflow is 0. If the value
+// does not fit, *overflow is set to +1 or -1 (the sign of the value) and -1
+// is returned. vv is cast to PyLongObject * without a type check.
+Py_ssize_t
+CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
+{
+    /* This version by Tim Peters */
+    PyLongObject *v = (PyLongObject *)vv;
+    size_t x, prev;
+    Py_ssize_t res;
+    Py_ssize_t i;
+    int sign;
+
+    *overflow = 0;
+
+    res = -1;
+    i = Py_SIZE(v);
+
+    sign = 1;
+    x = 0;
+    // A negative size marks a negative int; its magnitude is the digit count.
+    if (i < 0) {
+        sign = -1;
+        i = -(i);
+    }
+    // Accumulate the magnitude from the most significant digit down.
+    while (--i >= 0) {
+        prev = x;
+        x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
+        // If shifting back does not recover the previous value, bits were lost.
+        if ((x >> PyLong_SHIFT) != prev) {
+            *overflow = sign;
+            goto exit;
+        }
+    }
+    /* Haven't lost any bits, but casting to long requires extra
+     * care.
+     */
+    if (x <= (size_t)CPY_TAGGED_MAX) {
+        res = (Py_ssize_t)x * sign;
+    }
+    else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
+        res = CPY_TAGGED_MIN;
+    }
+    else {
+        *overflow = sign;
+        /* res is already set to -1 */
+    }
+  exit:
+    return res;
+}
+
+
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythonsupport.h
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/pythonsupport.h
@@ -0,0 +1,478 @@
+// Collects code that was copied in from cpython, for a couple of different reasons:
+//  * We wanted to modify it to produce a more efficient version for our uses
+//  * We needed to call it and it was static :(
+//  * We wanted to call it and needed to backport it
+
+#ifndef CPY_PYTHONSUPPORT_H
+#define CPY_PYTHONSUPPORT_H
+
+#include <stdbool.h>
+#include <Python.h>
+#include "pythoncapi_compat.h"
+#include <frameobject.h>
+#include <assert.h>
+#include "mypyc_util.h"
+
+#if CPY_3_13_FEATURES
+#ifndef Py_BUILD_CORE
+#define Py_BUILD_CORE
+#endif
+#include "internal/pycore_genobject.h"  // _PyGen_FetchStopIterationValue
+#include "internal/pycore_pyerrors.h"  // _PyErr_FormatFromCause, _PyErr_SetKeyError
+#include "internal/pycore_setobject.h"  // _PySet_Update
+#endif
+
+#if CPY_3_12_FEATURES
+#include "internal/pycore_frame.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#if 0
+} // why isn't emacs smart enough to not indent this
+#endif
+
+/////////////////////////////////////////
+// Adapted from bltinmodule.c in Python 3.7.0
+//
+// Resolve __mro_entries__ on the entries of the tuple `bases`.
+//
+// Every entry that is not a type and defines __mro_entries__ is replaced by
+// the items of the tuple that method returns (it is called with `bases` as
+// its only argument). Returns `bases` itself, without a new reference, when
+// nothing was replaced; otherwise returns a new tuple. Returns NULL with an
+// exception set on failure.
+_Py_IDENTIFIER(__mro_entries__);
+static PyObject*
+update_bases(PyObject *bases)
+{
+    Py_ssize_t i, j;
+    PyObject *base, *meth, *new_base, *result, *new_bases = NULL;
+    PyObject *stack[1] = {bases};
+    assert(PyTuple_Check(bases));
+
+    Py_ssize_t nargs = PyTuple_GET_SIZE(bases);
+    for (i = 0; i < nargs; i++) {
+        base = PyTuple_GET_ITEM(bases, i);
+        if (PyType_Check(base)) {
+            if (new_bases) {
+                /* If we already have made a replacement, then we append every normal base,
+                   otherwise just skip it. */
+                if (PyList_Append(new_bases, base) < 0) {
+                    goto error;
+                }
+            }
+            continue;
+        }
+        if (PyObject_GetOptionalAttrString(base, PyId___mro_entries__.string, &meth) < 0) {
+            goto error;
+        }
+        if (!meth) {
+            /* No __mro_entries__: treat it like a normal base. */
+            if (new_bases) {
+                if (PyList_Append(new_bases, base) < 0) {
+                    goto error;
+                }
+            }
+            continue;
+        }
+        new_base = PyObject_Vectorcall(meth, stack, 1, NULL);
+        Py_DECREF(meth);
+        if (!new_base) {
+            goto error;
+        }
+        if (!PyTuple_Check(new_base)) {
+            PyErr_SetString(PyExc_TypeError,
+                            "__mro_entries__ must return a tuple");
+            Py_DECREF(new_base);
+            goto error;
+        }
+        if (!new_bases) {
+            /* If this is a first successful replacement, create new_bases list and
+               copy previously encountered bases. */
+            if (!(new_bases = PyList_New(i))) {
+                /* We still own new_base here; release it before bailing out. */
+                Py_DECREF(new_base);
+                goto error;
+            }
+            for (j = 0; j < i; j++) {
+                base = PyTuple_GET_ITEM(bases, j);
+                PyList_SET_ITEM(new_bases, j, base);
+                Py_INCREF(base);
+            }
+        }
+        /* Append all items of new_base by assigning to the empty slice at the end. */
+        j = PyList_GET_SIZE(new_bases);
+        if (PyList_SetSlice(new_bases, j, j, new_base) < 0) {
+            /* We still own new_base here; release it before bailing out. */
+            Py_DECREF(new_base);
+            goto error;
+        }
+        Py_DECREF(new_base);
+    }
+    if (!new_bases) {
+        return bases;
+    }
+    result = PyList_AsTuple(new_bases);
+    Py_DECREF(new_bases);
+    return result;
+
+error:
+    Py_XDECREF(new_bases);
+    return NULL;
+}
+
+// From Python 3.7's typeobject.c
+//
+// Calls super(type, type).__init_subclass__ with no positional arguments and
+// kwds as the keyword arguments. Returns 0 on success and -1 (with an
+// exception set by the failing call) on error.
+_Py_IDENTIFIER(__init_subclass__);
+static int
+init_subclass(PyTypeObject *type, PyObject *kwds)
+{
+    PyObject *super, *func, *result;
+    PyObject *args[2] = {(PyObject *)type, (PyObject *)type};
+
+    // Build the super(type, type) object.
+    super = PyObject_Vectorcall((PyObject *)&PySuper_Type, args, 2, NULL);
+    if (super == NULL) {
+        return -1;
+    }
+
+    func = _PyObject_GetAttrId(super, &PyId___init_subclass__);
+    Py_DECREF(super);
+    if (func == NULL) {
+        return -1;
+    }
+
+    result = _PyObject_FastCallDict(func, NULL, 0, kwds);
+    Py_DECREF(func);
+    if (result == NULL) {
+        return -1;
+    }
+
+    // The return value itself is not used.
+    Py_DECREF(result);
+    return 0;
+}
+
+// Non-inlined slow path used by CPyLong_AsSsize_tAndOverflow below for ints
+// that are not handled by its inline fast cases.
+Py_ssize_t
+CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow);
+
+#if CPY_3_12_FEATURES
+
+// Convert the int object vv to a Py_ssize_t (Python 3.12 and later layout).
+// *overflow is set to 0 on the inline fast paths; for other values the
+// non-inline helper decides the result and the overflow flag.
+// vv is cast to PyLongObject * without a type check.
+static inline Py_ssize_t
+CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
+{
+    /* This version by Tim Peters */
+    PyLongObject *v = (PyLongObject *)vv;
+    Py_ssize_t res;
+    Py_ssize_t i;
+
+    *overflow = 0;
+
+    res = -1;
+    i = CPY_LONG_TAG(v);
+
+    // TODO: Combine zero and non-zero cases below?
+    if (likely(i == (1 << CPY_NON_SIZE_BITS))) {
+        // Tag says: exactly one digit, no sign bits set (positive).
+        res = CPY_LONG_DIGIT(v, 0);
+    } else if (likely(i == CPY_SIGN_ZERO)) {
+        res = 0;
+    } else if (i == ((1 << CPY_NON_SIZE_BITS) | CPY_SIGN_NEGATIVE)) {
+        // Tag says: exactly one digit, negative.
+        res = -(sdigit)CPY_LONG_DIGIT(v, 0);
+    } else {
+        // Slow path is moved to a non-inline helper function to
+        // limit size of generated code
+        int overflow_local;
+        res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
+        *overflow = overflow_local;
+    }
+    return res;
+}
+
+#else
+
+// Adapted from longobject.c in Python 3.7.0
+
+/* This function adapted from PyLong_AsLongLongAndOverflow, but with
+ * some safety checks removed and specialized to only work for objects
+ * that are already longs.
+ * About half of the win this provides, though, just comes from being
+ * able to inline the function, which in addition to saving function call
+ * overhead allows the out-parameter overflow flag to be collapsed into
+ * control flow.
+ * Additionally, we check against the possible range of CPyTagged, not of
+ * Py_ssize_t. */
+static inline Py_ssize_t
+CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
+{
+    /* This version by Tim Peters */
+    PyLongObject *v = (PyLongObject *)vv;
+    Py_ssize_t res;
+    Py_ssize_t i;
+
+    *overflow = 0;
+
+    res = -1;
+    i = Py_SIZE(v);
+
+    // Sizes 1, 0 and -1 (a single positive digit, zero, a single negative
+    // digit) are handled inline.
+    if (likely(i == 1)) {
+        res = CPY_LONG_DIGIT(v, 0);
+    } else if (likely(i == 0)) {
+        res = 0;
+    } else if (i == -1) {
+        res = -(sdigit)CPY_LONG_DIGIT(v, 0);
+    } else {
+        // Slow path is moved to a non-inline helper function to
+        // limit size of generated code
+        int overflow_local;
+        res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
+        *overflow = overflow_local;
+    }
+    return res;
+}
+
+#endif
+
+// Adapted from listobject.c in Python 3.7.0
+//
+// Set the size of the list to newsize, reallocating the item array when the
+// current allocation is too small or more than twice as large as needed.
+// Returns 0 on success, or -1 with MemoryError set on failure. Elements are
+// not touched: no references are released and new slots are not initialized.
+static int
+list_resize(PyListObject *self, Py_ssize_t newsize)
+{
+    PyObject **items;
+    size_t new_allocated, num_allocated_bytes;
+    Py_ssize_t allocated = self->allocated;
+
+    /* Bypass realloc() when a previous overallocation is large enough
+       to accommodate the newsize.  If the newsize falls lower than half
+       the allocated size, then proceed with the realloc() to shrink the list.
+    */
+    if (allocated >= newsize && newsize >= (allocated >> 1)) {
+        assert(self->ob_item != NULL || newsize == 0);
+        Py_SET_SIZE(self, newsize);
+        return 0;
+    }
+
+    /* This over-allocates proportional to the list size, making room
+     * for additional growth.  The over-allocation is mild, but is
+     * enough to give linear-time amortized behavior over a long
+     * sequence of appends() in the presence of a poorly-performing
+     * system realloc().
+     * The growth pattern is:  0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
+     * Note: new_allocated won't overflow because the largest possible value
+     *       is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
+     */
+    new_allocated = (size_t)newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6);
+    // Guard the byte-count multiplication below against overflow.
+    if (new_allocated > (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *)) {
+        PyErr_NoMemory();
+        return -1;
+    }
+
+    // An empty list gets no over-allocation at all.
+    if (newsize == 0)
+        new_allocated = 0;
+    num_allocated_bytes = new_allocated * sizeof(PyObject *);
+    // On failure the old array is left in place and still owned by the list.
+    items = (PyObject **)PyMem_Realloc(self->ob_item, num_allocated_bytes);
+    if (items == NULL) {
+        PyErr_NoMemory();
+        return -1;
+    }
+    self->ob_item = items;
+    Py_SET_SIZE(self, newsize);
+    self->allocated = new_allocated;
+    return 0;
+}
+
+// Changed to use PyList_SetSlice instead of the internal list_ass_slice
+//
+// Remove and return the item at index (negative indexes count from the end).
+// The caller receives ownership of the returned reference. Returns NULL with
+// IndexError set for an empty list or an out-of-range index, or NULL with
+// the underlying error if shrinking the list fails.
+static PyObject *
+list_pop_impl(PyListObject *self, Py_ssize_t index)
+{
+    PyObject *v;
+    int status;
+
+    if (Py_SIZE(self) == 0) {
+        /* Special-case most common failure cause */
+        PyErr_SetString(PyExc_IndexError, "pop from empty list");
+        return NULL;
+    }
+    if (index < 0)
+        index += Py_SIZE(self);
+    if (index < 0 || index >= Py_SIZE(self)) {
+        PyErr_SetString(PyExc_IndexError, "pop index out of range");
+        return NULL;
+    }
+    v = self->ob_item[index];
+    // Fast path for the last element: just shrink the list by one.
+    if (index == Py_SIZE(self) - 1) {
+        status = list_resize(self, Py_SIZE(self) - 1);
+        if (status >= 0)
+            return v; /* and v now owns the reference the list had */
+        else
+            return NULL;
+    }
+    // General case: take our own reference, then delete the one-element
+    // slice (which releases the list's reference).
+    Py_INCREF(v);
+    status = PyList_SetSlice((PyObject *)self, index, index+1, (PyObject *)NULL);
+    if (status < 0) {
+        Py_DECREF(v);
+        return NULL;
+    }
+    return v;
+}
+
+// Tweaked to directly use CPyTagged
+//
+// Count the items of self that compare equal to value. Returns the count as
+// a short tagged int, or CPY_INT_TAG if a comparison raised an exception.
+static CPyTagged
+list_count(PyListObject *self, PyObject *value)
+{
+    Py_ssize_t count = 0;
+    Py_ssize_t i;
+
+    // The size is re-read on every iteration because a comparison may run
+    // arbitrary code that changes the list.
+    for (i = 0; i < Py_SIZE(self); i++) {
+        PyObject *item = self->ob_item[i];
+        // Hold a strong reference during the comparison: __eq__ may remove
+        // the item from the list, which would otherwise free it while it is
+        // still being compared.
+        Py_INCREF(item);
+        int cmp = PyObject_RichCompareBool(item, value, Py_EQ);
+        Py_DECREF(item);
+        if (cmp > 0)
+            count++;
+        else if (cmp < 0)
+            return CPY_INT_TAG;
+    }
+    return CPyTagged_ShortFromSsize_t(count);
+}
+
+// Adapted from genobject.c in Python 3.7.2
+// Copied because it wasn't in 3.5.2 and it is undocumented anyways.
+/*
+ * Set StopIteration with specified value.  Value can be arbitrary object
+ * or NULL.
+ *
+ * Returns 0 if StopIteration is set and -1 if any other exception is set.
+ */
+static int
+CPyGen_SetStopIterationValue(PyObject *value)
+{
+    PyObject *e;
+
+    // Simple case: the value is neither a tuple nor an exception instance,
+    // so it can be handed to PyErr_SetObject as is.
+    if (value == NULL ||
+        (!PyTuple_Check(value) && !PyExceptionInstance_Check(value)))
+    {
+        /* Delay exception instantiation if we can */
+        PyErr_SetObject(PyExc_StopIteration, value);
+        return 0;
+    }
+    /* Construct an exception instance manually with
+     * PyObject_CallOneArg and pass it to PyErr_SetObject.
+     *
+     * We do this to handle a situation when "value" is a tuple, in which
+     * case PyErr_SetObject would set the value of StopIteration to
+     * the first element of the tuple.
+     *
+     * (See PyErr_SetObject/_PyErr_CreateException code for details.)
+     */
+    e = PyObject_CallOneArg(PyExc_StopIteration, value);
+    if (e == NULL) {
+        return -1;
+    }
+    PyErr_SetObject(PyExc_StopIteration, e);
+    // Our own reference to the instance is no longer needed.
+    Py_DECREF(e);
+    return 0;
+}
+
+// Copied from dictobject.c and dictobject.h; these are not public before
+// Python 3.8. Some error checks were removed because the callers do them.
+typedef struct {
+    PyObject_HEAD
+    PyDictObject *dv_dict;  // The dict being viewed (strong reference)
+} _CPyDictViewObject;
+
+// Allocate a GC-tracked view object of the given type that refers to dict.
+// Returns NULL if the allocation fails. dict is stored without a type check.
+static PyObject *
+_CPyDictView_New(PyObject *dict, PyTypeObject *type)
+{
+    _CPyDictViewObject *dv = PyObject_GC_New(_CPyDictViewObject, type);
+    if (dv == NULL)
+        return NULL;
+    // The view keeps its own reference to the dict.
+    Py_INCREF(dict);
+    dv->dv_dict = (PyDictObject *)dict;
+    PyObject_GC_Track(dv);
+    return (PyObject *)dv;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#if PY_VERSION_HEX >= 0x030A0000  // 3.10
+// Look up the attribute named by the identifier and report whether it was
+// found. The result of PyObject_GetOptionalAttrString is returned unchanged;
+// the attribute value itself is released immediately.
+static int
+_CPyObject_HasAttrId(PyObject *v, _Py_Identifier *name) {
+    PyObject *tmp = NULL;
+    int result = PyObject_GetOptionalAttrString(v, name->string, &tmp);
+    if (tmp) {
+        Py_DECREF(tmp);
+    }
+    return result;
+}
+#else
+// Older versions: use the private CPython helper directly.
+#define _CPyObject_HasAttrId _PyObject_HasAttrId
+#endif
+
+#if CPY_3_12_FEATURES
+
+// These are copied from genobject.c in Python 3.12
+
+// Return 1 if o is an exact generator object whose code has the
+// CO_ITERABLE_COROUTINE flag set, and 0 otherwise.
+static int
+gen_is_coroutine(PyObject *o)
+{
+    if (PyGen_CheckExact(o)) {
+        // PyGen_GetCode() returns a new strong reference, so it has to be
+        // released once the flags have been read.
+        PyCodeObject *code = PyGen_GetCode((PyGenObject*)o);
+        int is_coroutine = (code->co_flags & CO_ITERABLE_COROUTINE) != 0;
+        Py_DECREF(code);
+        return is_coroutine;
+    }
+    return 0;
+}
+
+#else
+
+// Copied from genobject.c in Python 3.10
+//
+// Return 1 if o is an exact generator object whose code has the
+// CO_ITERABLE_COROUTINE flag set, and 0 otherwise.
+static int
+gen_is_coroutine(PyObject *o)
+{
+    if (!PyGen_CheckExact(o)) {
+        return 0;
+    }
+    // gi_code is read straight from the generator; no new reference is taken.
+    PyCodeObject *code = (PyCodeObject *)((PyGenObject*)o)->gi_code;
+    return (code->co_flags & CO_ITERABLE_COROUTINE) ? 1 : 0;
+}
+
+#endif
+
+/*
+ *   This helper function returns an awaitable for `o`:
+ *     - `o` if `o` is a coroutine-object;
+ *     - `type(o)->tp_as_async->am_await(o)`
+ *
+ *   Raises a TypeError if it's not possible to return
+ *   an awaitable and returns NULL.
+ *
+ *   The returned object is a new reference.
+ */
+static PyObject *
+CPyCoro_GetAwaitableIter(PyObject *o)
+{
+    unaryfunc getter = NULL;
+    PyTypeObject *ot;
+
+    if (PyCoro_CheckExact(o) || gen_is_coroutine(o)) {
+        /* 'o' is a coroutine. */
+        Py_INCREF(o);
+        return o;
+    }
+
+    // Otherwise look for an am_await slot on the type.
+    ot = Py_TYPE(o);
+    if (ot->tp_as_async != NULL) {
+        getter = ot->tp_as_async->am_await;
+    }
+    if (getter != NULL) {
+        PyObject *res = (*getter)(o);
+        if (res != NULL) {
+            if (PyCoro_CheckExact(res) || gen_is_coroutine(res)) {
+                /* __await__ must return an *iterator*, not
+                   a coroutine or another awaitable (see PEP 492) */
+                PyErr_SetString(PyExc_TypeError,
+                                "__await__() returned a coroutine");
+                Py_CLEAR(res);
+            } else if (!PyIter_Check(res)) {
+                PyErr_Format(PyExc_TypeError,
+                             "__await__() returned non-iterator "
+                             "of type '%.100s'",
+                             Py_TYPE(res)->tp_name);
+                Py_CLEAR(res);
+            }
+        }
+        // res is NULL here if the slot call failed or a check above rejected it.
+        return res;
+    }
+
+    PyErr_Format(PyExc_TypeError,
+                 "object %.100s can't be used in 'await' expression",
+                 ot->tp_name);
+    return NULL;
+}
+
+
+#endif
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/set_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/set_ops.c
@@ -0,0 +1,17 @@
+// Set primitive operations
+//
+// These are registered in mypyc.primitives.set_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// Remove key from set, like set.remove(): returns true if the key was
+// present and has been removed. Returns false with KeyError set if the key
+// was missing, or false with the exception from PySet_Discard on error.
+bool CPySet_Remove(PyObject *set, PyObject *key) {
+    switch (PySet_Discard(set, key)) {
+    case 1:
+        return true;
+    case 0:
+        // Not found: discard is silent, but remove must raise.
+        _PyErr_SetKeyError(key);
+        return false;
+    default:
+        return false;
+    }
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/str_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/str_ops.c
@@ -0,0 +1,623 @@
+#include "pythoncapi_compat.h"
+
+// String primitive operations
+//
+// These are registered in mypyc.primitives.str_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// The _PyUnicode_CheckConsistency definition has been moved to the internal API
+// https://github.com/python/cpython/pull/106398
+#if defined(Py_DEBUG) && defined(CPY_3_13_FEATURES)
+#include "internal/pycore_unicodeobject.h"
+#endif
+
+// Adapted from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
+//
+// Bloom-style bitmask helpers used as a cheap pre-filter for character
+// membership tests:
+//   BLOOM_MASK       - integer type holding the mask
+//   BLOOM_WIDTH      - number of bits used, chosen from LONG_BIT below
+//   BLOOM(mask, ch)  - non-zero if the bit selected by the low bits of 'ch'
+//                      is set in 'mask'
+// Macro arguments are fully parenthesized so that any expression can be
+// passed as 'mask' or 'ch'.
+#define BLOOM_MASK unsigned long
+#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
+#if LONG_BIT >= 128
+#define BLOOM_WIDTH 128
+#elif LONG_BIT >= 64
+#define BLOOM_WIDTH 64
+#elif LONG_BIT >= 32
+#define BLOOM_WIDTH 32
+#else
+#error "LONG_BIT is smaller than 32"
+#endif
+
+// Adapted from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
+// This is needed for str.strip("...").
+//
+// Calculate a simple bloom-style bitmask for the 'len' code units stored at
+// 'ptr', whose width is given by 'kind' (one of the PyUnicode_*_KIND values).
+static inline BLOOM_MASK
+make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
+{
+    BLOOM_MASK bits = 0;
+
+// Set one bit in 'bits' for every code unit of type TYPE in the buffer.
+#define BLOOM_FOLD(TYPE)                                    \
+    do {                                                    \
+        const TYPE *cur = (const TYPE *)ptr;                \
+        const TYPE *stop = cur + len;                       \
+        while (cur != stop) {                               \
+            Py_UCS4 ch = *cur++;                            \
+            bits |= (1UL << (ch & (BLOOM_WIDTH - 1)));      \
+        }                                                   \
+    } while (0)
+
+    switch (kind) {
+    case PyUnicode_1BYTE_KIND:
+        BLOOM_FOLD(Py_UCS1);
+        break;
+    case PyUnicode_2BYTE_KIND:
+        BLOOM_FOLD(Py_UCS2);
+        break;
+    case PyUnicode_4BYTE_KIND:
+        BLOOM_FOLD(Py_UCS4);
+        break;
+    default:
+        Py_UNREACHABLE();
+    }
+
+#undef BLOOM_FOLD
+
+    return bits;
+}
+
+// Shared implementation of CPyStr_Equal and CPyStr_EqualLiteral. It does no
+// identity shortcut; both callers do that themselves before calling.
+// 'str2_length' is the caller-supplied length of 'str2'.
+// Returns 1 when both strings hold the same data, 0 otherwise.
+static inline char _CPyStr_Equal_NoIdentCheck(PyObject *str1, PyObject *str2, Py_ssize_t str2_length) {
+    const Py_ssize_t length = PyUnicode_GET_LENGTH(str1);
+    const int kind = PyUnicode_KIND(str1);
+    // Strings of different length or storage kind are treated as unequal.
+    if (length != str2_length || kind != PyUnicode_KIND(str2)) {
+        return 0;
+    }
+    // Same length and kind: compare the raw buffers ('kind' is bytes per char).
+    return memcmp(PyUnicode_DATA(str1), PyUnicode_DATA(str2), length * kind) == 0;
+}
+
+// Adapted from CPython 3.13.1 (_PyUnicode_Equal)
+//
+// Return 1 if 'str1' and 'str2' are the same object or hold equal data,
+// 0 otherwise.
+char CPyStr_Equal(PyObject *str1, PyObject *str2) {
+    // Identical objects are equal without looking at their contents.
+    return (str1 == str2)
+        ? 1
+        : _CPyStr_Equal_NoIdentCheck(str1, str2, PyUnicode_GET_LENGTH(str2));
+}
+
+// Like CPyStr_Equal, but the length of 'literal_str' is supplied by the caller
+// as 'literal_length' instead of being read from the object.
+char CPyStr_EqualLiteral(PyObject *str, PyObject *literal_str, Py_ssize_t literal_length) {
+    // Identical objects are equal without looking at their contents.
+    return (str == literal_str)
+        ? 1
+        : _CPyStr_Equal_NoIdentCheck(str, literal_str, literal_length);
+}
+
+// Return a new one-character str holding str[index]. Negative indexes count
+// from the end of the string.
+//
+// Returns NULL with an exception set on failure: IndexError if the index is
+// out of range, OverflowError if 'index' is not a short tagged int.
+PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
+    if (PyUnicode_READY(str) != -1) {
+        if (CPyTagged_CheckShort(index)) {
+            Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
+            Py_ssize_t size = PyUnicode_GET_LENGTH(str);
+            if (n < 0)
+                n += size;
+            if (n < 0 || n >= size) {
+                PyErr_SetString(PyExc_IndexError, "string index out of range");
+                return NULL;
+            }
+            enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
+            void *data = PyUnicode_DATA(str);
+            Py_UCS4 ch = PyUnicode_READ(kind, data, n);
+            // Allocate a one-character string just wide enough for 'ch'.
+            PyObject *unicode = PyUnicode_New(1, ch);
+            if (unicode == NULL)
+                return NULL;
+
+            // Store the character using the width the new string was given.
+            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
+                PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
+            } else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
+                PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
+            } else {
+                assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
+                PyUnicode_4BYTE_DATA(unicode)[0] = ch;
+            }
+            return unicode;
+        } else {
+            PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+            return NULL;
+        }
+    } else {
+        // Fallback: go through the generic item protocol with a boxed index.
+        PyObject *index_obj = CPyTagged_AsObject(index);
+        if (index_obj == NULL) {
+            return NULL;
+        }
+        PyObject *item = PyObject_GetItem(str, index_obj);
+        // The boxed index is a reference owned by this function; release it
+        // so it is not leaked.
+        Py_DECREF(index_obj);
+        return item;
+    }
+}
+
+// Variant of CPyStr_GetItem that takes a plain Py_ssize_t index.
+PyObject *CPyStr_GetItemUnsafe(PyObject *str, Py_ssize_t index) {
+    // Tag the index by shifting it left one bit. This is unsafe since the
+    // shift is not checked for overflow.
+    CPyTagged tagged_index = index << 1;
+    return CPyStr_GetItem(str, tagged_index);
+}
+
+// A simplification of _PyUnicode_JoinArray() from CPython 3.9.6
+//
+// Concatenate 'len' objects, passed as variadic PyObject * arguments, into a
+// newly allocated str.
+//
+// Returns NULL with an exception set on failure: TypeError if an argument is
+// not a str, OverflowError if the combined length exceeds PY_SSIZE_T_MAX.
+PyObject *CPyStr_Build(Py_ssize_t len, ...) {
+    Py_ssize_t i;
+    va_list args;
+
+    // Calculate the total amount of space and check
+    // whether all components have the same kind.
+    Py_ssize_t sz = 0;
+    Py_UCS4 maxchar = 0;
+    int use_memcpy = 1; // Use memcpy by default
+    PyObject *last_obj = NULL;
+
+    // Every early return inside a va_start()/va_end() pair calls va_end()
+    // first: C requires the pair to be matched before the function returns.
+    va_start(args, len);
+    for (i = 0; i < len; i++) {
+        PyObject *item = va_arg(args, PyObject *);
+        if (!PyUnicode_Check(item)) {
+            PyErr_Format(PyExc_TypeError,
+                         "sequence item %zd: expected str instance,"
+                         " %.80s found",
+                         i, Py_TYPE(item)->tp_name);
+            va_end(args);
+            return NULL;
+        }
+        if (PyUnicode_READY(item) == -1) {
+            va_end(args);
+            return NULL;
+        }
+
+        size_t add_sz = PyUnicode_GET_LENGTH(item);
+        Py_UCS4 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
+        maxchar = Py_MAX(maxchar, item_maxchar);
+
+        // Using size_t to avoid overflow during arithmetic calculation
+        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
+            PyErr_SetString(PyExc_OverflowError,
+                            "join() result is too long for a Python string");
+            va_end(args);
+            return NULL;
+        }
+        sz += add_sz;
+
+        // If these strings have different kind, we would call
+        // _PyUnicode_FastCopyCharacters() in the following part.
+        if (use_memcpy && last_obj != NULL) {
+            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
+                use_memcpy = 0;
+        }
+        last_obj = item;
+    }
+    va_end(args);
+
+    // Construct the string
+    PyObject *res = PyUnicode_New(sz, maxchar);
+    if (res == NULL)
+        return NULL;
+
+    if (use_memcpy) {
+        // All items share one kind, so their buffers can be copied
+        // back-to-back; 'kind' is the number of bytes per character.
+        unsigned char *res_data = PyUnicode_1BYTE_DATA(res);
+        unsigned int kind = PyUnicode_KIND(res);
+
+        va_start(args, len);
+        for (i = 0; i < len; ++i) {
+            PyObject *item = va_arg(args, PyObject *);
+            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
+            if (itemlen != 0) {
+                memcpy(res_data, PyUnicode_DATA(item), kind * itemlen);
+                res_data += kind * itemlen;
+            }
+        }
+        va_end(args);
+        assert(res_data == PyUnicode_1BYTE_DATA(res) + kind * PyUnicode_GET_LENGTH(res));
+    } else {
+        // Mixed kinds: copy character by character, widening as needed.
+        Py_ssize_t res_offset = 0;
+
+        va_start(args, len);
+        for (i = 0; i < len; ++i) {
+            PyObject *item = va_arg(args, PyObject *);
+            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
+            if (itemlen != 0) {
+#if CPY_3_13_FEATURES
+                // The public copy function reports failure with a negative
+                // result; do not return a partially filled string.
+                if (PyUnicode_CopyCharacters(res, res_offset, item, 0, itemlen) < 0) {
+                    va_end(args);
+                    Py_DECREF(res);
+                    return NULL;
+                }
+#else
+                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
+#endif
+                res_offset += itemlen;
+            }
+        }
+        va_end(args);
+        assert(res_offset == PyUnicode_GET_LENGTH(res));
+    }
+
+#ifdef Py_DEBUG
+    assert(_PyUnicode_CheckConsistency(res, 1));
+#endif
+    return res;
+}
+
+// Search 'str' for 'substr' from 'start' up to the end of the string.
+// Delegates to CPyStr_FindWithEnd and returns its result unchanged.
+CPyTagged CPyStr_Find(PyObject *str, PyObject *substr, CPyTagged start, int direction) {
+    // The end bound is the string length, shifted left one bit to tag it.
+    Py_ssize_t length = PyUnicode_GET_LENGTH(str);
+    return CPyStr_FindWithEnd(str, substr, start, (CPyTagged)(length << 1), direction);
+}
+
+// Search str[start:end] for 'substr' using PyUnicode_Find(); 'direction' is
+// passed through to it unchanged.
+//
+// Returns the PyUnicode_Find() result shifted left one bit (so -1, "not
+// found", comes back as a tagged -1). Returns CPY_INT_TAG with an exception
+// set if 'start' or 'end' does not fit in a Py_ssize_t or the search fails.
+CPyTagged CPyStr_FindWithEnd(PyObject *str, PyObject *substr, CPyTagged start, CPyTagged end, int direction) {
+    Py_ssize_t temp_start = CPyTagged_AsSsize_t(start);
+    if (temp_start == -1 && PyErr_Occurred()) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return CPY_INT_TAG;
+    }
+    Py_ssize_t temp_end = CPyTagged_AsSsize_t(end);
+    if (temp_end == -1 && PyErr_Occurred()) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return CPY_INT_TAG;
+    }
+    Py_ssize_t index = PyUnicode_Find(str, substr, temp_start, temp_end, direction);
+    if (unlikely(index == -2)) {
+        // PyUnicode_Find() signals an error with -2.
+        return CPY_INT_TAG;
+    }
+    // 'index' is -1 when nothing was found. Left-shifting a negative signed
+    // value is undefined behavior in C, so convert to the tagged type first
+    // and shift there; the resulting bit pattern is the same.
+    return (CPyTagged)index << 1;
+}
+
+// Split 'str' on 'sep' with PyUnicode_Split(), using 'max_split' as the limit.
+// Returns NULL with OverflowError set if 'max_split' does not fit in a
+// Py_ssize_t.
+PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split) {
+    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
+    // -1 is only an error marker when an exception is pending as well.
+    if (limit != -1 || !PyErr_Occurred()) {
+        return PyUnicode_Split(str, sep, limit);
+    }
+    PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+    return NULL;
+}
+
+// Split 'str' on 'sep' with PyUnicode_RSplit(), using 'max_split' as the
+// limit. Returns NULL with OverflowError set if 'max_split' does not fit in a
+// Py_ssize_t.
+PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split) {
+    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
+    // -1 is only an error marker when an exception is pending as well.
+    if (limit != -1 || !PyErr_Occurred()) {
+        return PyUnicode_RSplit(str, sep, limit);
+    }
+    PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+    return NULL;
+}
+
+// Adapted from _PyUnicode_XStrip in cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
+//
+// Return the substring of 'self' left after removing leading and/or trailing
+// characters that occur in 'sepobj'. The left side is stripped unless
+// 'striptype' is RIGHTSTRIP; the right side is stripped unless it is LEFTSTRIP.
+static PyObject *_PyStr_XStrip(PyObject *self, int striptype, PyObject *sepobj) {
+    // This check is needed for Python 3.9 and earlier.
+    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
+        return NULL;
+
+    const int kind = PyUnicode_KIND(self);
+    const void *data = PyUnicode_DATA(self);
+    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    const Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
+    const BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
+                                               PyUnicode_DATA(sepobj),
+                                               seplen);
+
+    // Advance 'lo' past leading characters found in 'sepobj'. The bloom mask
+    // is checked first; only when its bit is set is 'sepobj' searched exactly.
+    Py_ssize_t lo = 0;
+    if (striptype != RIGHTSTRIP) {
+        for (; lo < len; lo++) {
+            Py_UCS4 ch = PyUnicode_READ(kind, data, lo);
+            if (!BLOOM(sepmask, ch)
+                    || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
+                break;
+            }
+        }
+    }
+
+    // Move 'hi' (one past the last kept character) back over trailing
+    // characters found in 'sepobj', never crossing 'lo'.
+    Py_ssize_t hi = len;
+    if (striptype != LEFTSTRIP) {
+        for (; hi > lo; hi--) {
+            Py_UCS4 ch = PyUnicode_READ(kind, data, hi - 1);
+            if (!BLOOM(sepmask, ch)
+                    || PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0) {
+                break;
+            }
+        }
+    }
+
+    return PyUnicode_Substring(self, lo, hi);
+}
+
+// Adapted from do_strip function in cpython.git/Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
+//
+// Strip 'self' on the left unless 'strip_type' is RIGHTSTRIP and on the right
+// unless it is LEFTSTRIP. When 'sep' is NULL or None, whitespace is removed;
+// otherwise the work is delegated to _PyStr_XStrip with 'sep'.
+PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep) {
+    if (sep != NULL && !Py_IsNone(sep)) {
+        return _PyStr_XStrip(self, strip_type, sep);
+    }
+
+    // This check is needed for Python 3.9 and earlier.
+    if (PyUnicode_READY(self) == -1)
+        return NULL;
+
+    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
+    const int trim_left = (strip_type != RIGHTSTRIP);
+    const int trim_right = (strip_type != LEFTSTRIP);
+    Py_ssize_t lo = 0;    // index of the first kept character
+    Py_ssize_t hi = len;  // one past the last kept character
+
+    if (PyUnicode_IS_ASCII(self)) {
+        // ASCII strings can use the byte-indexed whitespace lookup table.
+        const Py_UCS1 *chars = PyUnicode_1BYTE_DATA(self);
+
+        if (trim_left) {
+            while (lo < len && _Py_ascii_whitespace[chars[lo]]) {
+                lo++;
+            }
+        }
+        if (trim_right) {
+            while (hi > lo && _Py_ascii_whitespace[chars[hi - 1]]) {
+                hi--;
+            }
+        }
+    }
+    else {
+        // General case: read each character and test it with
+        // Py_UNICODE_ISSPACE.
+        const int kind = PyUnicode_KIND(self);
+        const void *data = PyUnicode_DATA(self);
+
+        if (trim_left) {
+            while (lo < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, lo))) {
+                lo++;
+            }
+        }
+        if (trim_right) {
+            while (hi > lo && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, hi - 1))) {
+                hi--;
+            }
+        }
+    }
+
+    return PyUnicode_Substring(self, lo, hi);
+}
+
+// Replace occurrences of 'old_substr' in 'str' with 'new_substr' via
+// PyUnicode_Replace(), passing 'max_replace' as the count. Returns NULL with
+// OverflowError set if 'max_replace' does not fit in a Py_ssize_t.
+PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr,
+                         PyObject *new_substr, CPyTagged max_replace) {
+    Py_ssize_t limit = CPyTagged_AsSsize_t(max_replace);
+    // -1 is only an error marker when an exception is pending as well.
+    if (limit != -1 || !PyErr_Occurred()) {
+        return PyUnicode_Replace(str, old_substr, new_substr, limit);
+    }
+    PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+    return NULL;
+}
+
+// Test whether 'self' starts with 'subobj', which is either a single object
+// or a tuple of candidate prefixes.
+//
+// Tuple argument: returns 1 if any element is a prefix, 0 if none is, and 2
+// with an exception set on error (a non-str element or a failed comparison).
+// Non-tuple argument: returns the PyUnicode_Tailmatch() result directly.
+int CPyStr_Startswith(PyObject *self, PyObject *subobj) {
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
+    if (PyTuple_Check(subobj)) {
+        Py_ssize_t i;
+        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for startswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
+                return 2;
+            }
+            int result = PyUnicode_Tailmatch(self, substring, start, end, -1);
+            if (result < 0) {
+                // PyUnicode_Tailmatch() reports failure with -1; return the
+                // error value rather than treating it as a match.
+                return 2;
+            }
+            if (result) {
+                return 1;
+            }
+        }
+        return 0;
+    }
+    return PyUnicode_Tailmatch(self, subobj, start, end, -1);
+}
+
+// Test whether 'self' ends with 'subobj', which is either a single object or
+// a tuple of candidate suffixes.
+//
+// Tuple argument: returns 1 if any element is a suffix, 0 if none is, and 2
+// with an exception set on error (a non-str element or a failed comparison).
+// Non-tuple argument: returns the PyUnicode_Tailmatch() result directly.
+int CPyStr_Endswith(PyObject *self, PyObject *subobj) {
+    Py_ssize_t start = 0;
+    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
+    if (PyTuple_Check(subobj)) {
+        Py_ssize_t i;
+        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
+            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
+            if (!PyUnicode_Check(substring)) {
+                PyErr_Format(PyExc_TypeError,
+                             "tuple for endswith must only contain str, "
+                             "not %.100s",
+                             Py_TYPE(substring)->tp_name);
+                return 2;
+            }
+            int result = PyUnicode_Tailmatch(self, substring, start, end, 1);
+            if (result < 0) {
+                // PyUnicode_Tailmatch() reports failure with -1; return the
+                // error value rather than treating it as a match.
+                return 2;
+            }
+            if (result) {
+                return 1;
+            }
+        }
+        return 0;
+    }
+    return PyUnicode_Tailmatch(self, subobj, start, end, 1);
+}
+
+// Return 'self' without 'prefix' if it starts with it, otherwise a new
+// reference to 'self'. Returns NULL if the prefix comparison fails.
+PyObject *CPyStr_Removeprefix(PyObject *self, PyObject *prefix) {
+    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
+    int match = PyUnicode_Tailmatch(self, prefix, 0, end, -1);
+    if (match < 0) {
+        // PyUnicode_Tailmatch() reports failure with -1 and sets an
+        // exception; propagate it rather than treating it as a match.
+        return NULL;
+    }
+    if (match) {
+        Py_ssize_t prefix_end = PyUnicode_GET_LENGTH(prefix);
+        return PyUnicode_Substring(self, prefix_end, end);
+    }
+    return Py_NewRef(self);
+}
+
+// Return 'self' without 'suffix' if it ends with it, otherwise a new
+// reference to 'self'. Returns NULL if the suffix comparison fails.
+PyObject *CPyStr_Removesuffix(PyObject *self, PyObject *suffix) {
+    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
+    int match = PyUnicode_Tailmatch(self, suffix, 0, end, 1);
+    if (match < 0) {
+        // PyUnicode_Tailmatch() reports failure with -1 and sets an
+        // exception; propagate it rather than treating it as a match.
+        return NULL;
+    }
+    if (match) {
+        Py_ssize_t suffix_end = PyUnicode_GET_LENGTH(suffix);
+        return PyUnicode_Substring(self, 0, end - suffix_end);
+    }
+    return Py_NewRef(self);
+}
+
+/* Append 'o2' to 'o1' with PyUnicode_Append(), which may do so in place.
+   Returns whatever PyUnicode_Append() leaves in the left-hand slot. */
+PyObject *CPyStr_Append(PyObject *o1, PyObject *o2) {
+    PyObject *result = o1;
+    /* PyUnicode_Append() writes its outcome back through the pointer. */
+    PyUnicode_Append(&result, o2);
+    return result;
+}
+
+// Return obj[start:end]. Exact str objects with short tagged bounds take a
+// fast path through PyUnicode_Substring(); everything else is handed to
+// CPyObject_GetSlice().
+PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
+    if (unlikely(!PyUnicode_CheckExact(obj)
+                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
+        return CPyObject_GetSlice(obj, start, end);
+    }
+
+    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
+    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
+    // Negative bounds count from the end and are clamped at zero.
+    if (lo < 0) {
+        lo = Py_MAX(lo + PyUnicode_GET_LENGTH(obj), 0);
+    }
+    if (hi < 0) {
+        hi = Py_MAX(hi + PyUnicode_GET_LENGTH(obj), 0);
+    }
+    return PyUnicode_Substring(obj, lo, hi);
+}
+
+/* Truth value of a string: true exactly when its length is non-zero. */
+bool CPyStr_IsTrue(PyObject *obj) {
+    return PyUnicode_GET_LENGTH(obj) != 0;
+}
+
+// Return the length of 'str', or -1 if PyUnicode_READY() fails.
+Py_ssize_t CPyStr_Size_size_t(PyObject *str) {
+    if (PyUnicode_READY(str) == -1) {
+        return -1;
+    }
+    return PyUnicode_GET_LENGTH(str);
+}
+
+// Decode 'obj' to a str. 'encoding' and 'errors' are optional str objects;
+// when NULL, a NULL name is passed on to the decoder. Returns NULL if either
+// name cannot be converted to UTF-8 or if decoding fails.
+PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors) {
+    const char *encoding_name = NULL;
+    if (encoding != NULL) {
+        encoding_name = PyUnicode_AsUTF8AndSize(encoding, NULL);
+        if (encoding_name == NULL) {
+            return NULL;
+        }
+    }
+
+    const char *errors_name = NULL;
+    if (errors != NULL) {
+        errors_name = PyUnicode_AsUTF8AndSize(errors, NULL);
+        if (errors_name == NULL) {
+            return NULL;
+        }
+    }
+
+    if (!PyBytes_Check(obj)) {
+        // Not a bytes object: use the generic encoded-object path.
+        return PyUnicode_FromEncodedObject(obj, encoding_name, errors_name);
+    }
+    // bytes: decode straight from the object's internal buffer.
+    return PyUnicode_Decode(PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
+                            encoding_name, errors_name);
+}
+
+PyObject *CPy_DecodeUTF8(PyObject *bytes) {
+    if (PyBytes_CheckExact(bytes)) {
+        char *buffer = PyBytes_AsString(bytes);   // Borrowed reference
+        if (buffer == NULL) {
+            return NULL;
+        }
+        Py_ssize_t size = PyBytes_Size(bytes);
+        return PyUnicode_DecodeUTF8(buffer, size, "strict");
+    } else {
+        return PyUnicode_FromEncodedObject(bytes, "utf-8", "strict");
+    }
+}
+
+PyObject *CPy_DecodeASCII(PyObject *bytes) {
+    if (PyBytes_CheckExact(bytes)) {
+        char *buffer = PyBytes_AsString(bytes);   // Borrowed reference
+        if (buffer == NULL) {
+            return NULL;
+        }
+        Py_ssize_t size = PyBytes_Size(bytes);
+        return PyUnicode_DecodeASCII(buffer, size, "strict");;
+    } else {
+        return PyUnicode_FromEncodedObject(bytes, "ascii", "strict");
+    }
+}
+
+PyObject *CPy_DecodeLatin1(PyObject *bytes) {
+    if (PyBytes_CheckExact(bytes)) {
+        char *buffer = PyBytes_AsString(bytes);   // Borrowed reference
+        if (buffer == NULL) {
+            return NULL;
+        }
+        Py_ssize_t size = PyBytes_Size(bytes);
+        return PyUnicode_DecodeLatin1(buffer, size, "strict");
+    } else {
+        return PyUnicode_FromEncodedObject(bytes, "latin1", "strict");
+    }
+}
+
+// Encode the str 'obj' with PyUnicode_AsEncodedString(). 'encoding' and
+// 'errors' are optional str objects; when NULL, a NULL name is passed on.
+// Returns NULL if either name cannot be converted to UTF-8, if 'obj' is not a
+// str (via PyErr_BadArgument()), or if encoding fails.
+PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors) {
+    const char *encoding_name = NULL;
+    if (encoding != NULL) {
+        encoding_name = PyUnicode_AsUTF8AndSize(encoding, NULL);
+        if (encoding_name == NULL) {
+            return NULL;
+        }
+    }
+
+    const char *errors_name = NULL;
+    if (errors != NULL) {
+        errors_name = PyUnicode_AsUTF8AndSize(errors, NULL);
+        if (errors_name == NULL) {
+            return NULL;
+        }
+    }
+
+    if (!PyUnicode_Check(obj)) {
+        PyErr_BadArgument();
+        return NULL;
+    }
+    return PyUnicode_AsEncodedString(obj, encoding_name, errors_name);
+}
+
+// Count occurrences of 'substring' in 'unicode' from 'start' to the end of
+// the string, using PyUnicode_Count(). Returns -1 with OverflowError set if
+// 'start' does not fit in a Py_ssize_t.
+Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start) {
+    Py_ssize_t first = CPyTagged_AsSsize_t(start);
+    // -1 is only an error marker when an exception is pending as well.
+    if (first != -1 || !PyErr_Occurred()) {
+        return PyUnicode_Count(unicode, substring, first,
+                               PyUnicode_GET_LENGTH(unicode));
+    }
+    PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+    return -1;
+}
+
+// Count occurrences of 'substring' in unicode[start:end] using
+// PyUnicode_Count(). Returns -1 with OverflowError set if 'start' or 'end'
+// does not fit in a Py_ssize_t.
+Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end) {
+    Py_ssize_t first = CPyTagged_AsSsize_t(start);
+    // -1 is only an error marker when an exception is pending as well.
+    if (first == -1 && PyErr_Occurred()) {
+        goto overflow;
+    }
+    Py_ssize_t last = CPyTagged_AsSsize_t(end);
+    if (last == -1 && PyErr_Occurred()) {
+        goto overflow;
+    }
+    return PyUnicode_Count(unicode, substring, first, last);
+
+overflow:
+    PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+    return -1;
+}
+
+
+// Return the code point of the one-character string 'obj', shifted left one
+// bit to tag it. For any other length, sets TypeError and returns CPY_INT_TAG.
+CPyTagged CPyStr_Ord(PyObject *obj) {
+    Py_ssize_t length = PyUnicode_GET_LENGTH(obj);
+    if (length != 1) {
+        PyErr_Format(
+            PyExc_TypeError, "ord() expected a character, but a string of length %zd found", length);
+        return CPY_INT_TAG;
+    }
+    int kind = PyUnicode_KIND(obj);
+    return PyUnicode_READ(kind, PyUnicode_DATA(obj), 0) << 1;
+}
--- a/.venv/lib/python3.12/site-packages/mypyc/lib-rt/tuple_ops.c
+++ b/.venv/lib/python3.12/site-packages/mypyc/lib-rt/tuple_ops.c
@@ -0,0 +1,62 @@
+// Tuple primitive operations
+//
+// These are registered in mypyc.primitives.tuple_ops.
+
+#include <Python.h>
+#include "CPy.h"
+
+// Return a new reference to tuple[index]; negative indexes count from the end.
+// Returns NULL with IndexError set if the index is out of range, or with
+// OverflowError set if 'index' is not a short tagged int.
+PyObject *CPySequenceTuple_GetItem(PyObject *tuple, CPyTagged index) {
+    if (!CPyTagged_CheckShort(index)) {
+        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
+        return NULL;
+    }
+
+    Py_ssize_t size = PyTuple_GET_SIZE(tuple);
+    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
+    if (pos < 0) {
+        // Translate an index relative to the end into an absolute one.
+        pos += size;
+    }
+    if (pos < 0 || pos >= size) {
+        PyErr_SetString(PyExc_IndexError, "tuple index out of range");
+        return NULL;
+    }
+
+    PyObject *item = PyTuple_GET_ITEM(tuple, pos);
+    Py_INCREF(item);
+    return item;
+}
+
+// Return obj[start:end]. Exact tuples with short tagged bounds take a fast
+// path through PyTuple_GetSlice(); everything else is handed to
+// CPyObject_GetSlice().
+PyObject *CPySequenceTuple_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
+    if (unlikely(!PyTuple_CheckExact(obj)
+                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
+        return CPyObject_GetSlice(obj, start, end);
+    }
+
+    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
+    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
+    // Negative bounds count from the end of the tuple; no clamping is done
+    // here before calling PyTuple_GetSlice().
+    if (lo < 0) {
+        lo += PyTuple_GET_SIZE(obj);
+    }
+    if (hi < 0) {
+        hi += PyTuple_GET_SIZE(obj);
+    }
+    return PyTuple_GetSlice(obj, lo, hi);
+}
+
+// No error checking
+PyObject *CPySequenceTuple_GetItemUnsafe(PyObject *tuple, Py_ssize_t index)
+{
+    PyObject *result = PyTuple_GET_ITEM(tuple, index);
+    Py_INCREF(result);
+    return result;
+}
+
+// Store 'value' at position 'index' of 'tuple' using PyTuple_SET_ITEM.
+//
+// PyTuple_SET_ITEM does no error checking,
+// and should only be used to fill in brand new tuples.
+// This function adds no reference counting of its own around the macro.
+void CPySequenceTuple_SetItemUnsafe(PyObject *tuple, Py_ssize_t index, PyObject *value)
+{
+    PyTuple_SET_ITEM(tuple, index, value);
+}