Initial commit: add .gitignore and README
Some checks failed
Tests / test (3.10) (push) Has been cancelled
Tests / test (3.11) (push) Has been cancelled
Tests / test (3.12) (push) Has been cancelled
Tests / lint (push) Has been cancelled
Tests / docker (push) Has been cancelled

This commit is contained in:
defiQUG
2026-02-09 21:51:42 -08:00
commit c052b07662
3146 changed files with 808305 additions and 0 deletions

View File

@@ -0,0 +1,975 @@
// Mypyc C API
#ifndef CPY_CPY_H
#define CPY_CPY_H
#include <stdbool.h>
#include <Python.h>
#include <frameobject.h>
#include <structmember.h>
#include <assert.h>
#include <stdint.h>
#include "pythonsupport.h"
#include "mypyc_util.h"
#ifdef __cplusplus
extern "C" {
#endif
#if 0
} // why isn't emacs smart enough to not indent this
#endif
#define CPYTHON_LARGE_INT_ERRMSG "Python int too large to convert to C ssize_t"
// Naming conventions:
//
// Tagged: tagged int
// Long: tagged long int (pointer)
// Short: tagged short int (unboxed)
// Ssize_t: A Py_ssize_t, which ought to be the same width as pointers
// Object: CPython object (PyObject *)
// Tuple type definitions needed for API functions
// Unboxed triple of object pointers (fields f0..f2). In this header it is the
// by-value return type of CPy_CatchError() and CPy_GetExcInfo() and the
// argument type of CPy_RestoreExcInfo(). The MYPYC_DECLARED_ guard skips the
// definition if the same struct has already been declared.
#ifndef MYPYC_DECLARED_tuple_T3OOO
#define MYPYC_DECLARED_tuple_T3OOO
typedef struct tuple_T3OOO {
PyObject *f0;
PyObject *f1;
PyObject *f2;
} tuple_T3OOO;
#endif
// Our return tuple wrapper for dictionary iteration helper.
// Returned by value from CPyDict_NextKey() and CPyDict_NextValue().
// The MYPYC_DECLARED_ guard skips the definition if it was already declared.
#ifndef MYPYC_DECLARED_tuple_T3CIO
#define MYPYC_DECLARED_tuple_T3CIO
typedef struct tuple_T3CIO {
char f0; // Should continue?
CPyTagged f1; // Last dict offset
PyObject *f2; // Next dictionary key or value
} tuple_T3CIO;
#endif
// Return tuple wrapper for the dictionary iteration helper that yields both
// the key and the value; returned by value from CPyDict_NextItem().
// The MYPYC_DECLARED_ guard skips the definition if it was already declared.
#ifndef MYPYC_DECLARED_tuple_T4CIOO
#define MYPYC_DECLARED_tuple_T4CIOO
typedef struct tuple_T4CIOO {
char f0; // Should continue?
CPyTagged f1; // Last dict offset
PyObject *f2; // Next dictionary key
PyObject *f3; // Next dictionary value
} tuple_T4CIOO;
#endif
// System-wide empty tuple constant (declared here, defined elsewhere).
extern PyObject *__mypyc_empty_tuple__;

// Return the shared empty tuple. When CPY_3_12_FEATURES is off the reference
// count is bumped before returning; when it is on, the pointer is returned
// without touching the reference count.
static inline PyObject *CPyTuple_LoadEmptyTupleConstant(void) {
    PyObject *empty = __mypyc_empty_tuple__;
#if !CPY_3_12_FEATURES
    Py_INCREF(empty);
#endif
    return empty;
}
// Native object operations
// Locate the sub-vtable that implements `trait` for the class owning `vtable`.
//
// The trait entries live *before* the start of the vtable proper, laid out in
// groups of three slots; the first slot of a group is the trait type and the
// second is the matching sub-vtable. The scan walks backwards one group at a
// time and never checks bounds, so the caller must be certain the trait is
// actually present.
static inline CPyVTableItem *CPy_FindTraitVtable(PyTypeObject *trait, CPyVTableItem *vtable) {
    int slot = -3;
    while ((PyTypeObject *)vtable[slot] != trait) {
        slot -= 3;
    }
    return (CPyVTableItem *)vtable[slot + 1];
}
// Look up entry `index` of the offset table recorded for `trait`.
//
// Uses the same backwards, unchecked scan over three-slot groups as the trait
// vtable lookup; the third slot of the matching group points at a size_t
// table, and the element at `index` is returned.
static inline size_t CPy_FindAttrOffset(PyTypeObject *trait, CPyVTableItem *vtable, size_t index) {
    int slot = -3;
    while ((PyTypeObject *)vtable[slot] != trait) {
        slot -= 3;
    }
    const size_t *offsets = (size_t *)vtable[slot + 2];
    return offsets[index];
}
// Vtable dispatch helpers.
//
// Each macro casts `obj` to `object_type *`, fetches slot `vtable_index` from
// its vtable (or, for the _TRAIT variants, from the sub-vtable that
// CPy_FindTraitVtable() returns for `trait`) and casts that slot to the
// required function pointer type. The `type` parameter is accepted but unused.
//
// The pointer-like arguments are fully parenthesized so that compound
// expressions (e.g. `cond ? a : b`) are cast as a whole. Note that `obj` is
// evaluated more than once in the GET_ATTR/SET_ATTR forms, so it must not have
// side effects.

// Get attribute value using vtable (may return an undefined value)
#define CPY_GET_ATTR(obj, type, vtable_index, object_type, attr_type) \
    ((attr_type (*)(object_type *))((object_type *)(obj))->vtable[vtable_index])((object_type *)(obj))
#define CPY_GET_ATTR_TRAIT(obj, trait, vtable_index, object_type, attr_type) \
    ((attr_type (*)(object_type *))(CPy_FindTraitVtable((trait), ((object_type *)(obj))->vtable))[vtable_index])((object_type *)(obj))
// Set attribute value using vtable
#define CPY_SET_ATTR(obj, type, vtable_index, value, object_type, attr_type) \
    ((bool (*)(object_type *, attr_type))((object_type *)(obj))->vtable[vtable_index])( \
        (object_type *)(obj), (value))
#define CPY_SET_ATTR_TRAIT(obj, trait, vtable_index, value, object_type, attr_type) \
    ((bool (*)(object_type *, attr_type))(CPy_FindTraitVtable((trait), ((object_type *)(obj))->vtable))[vtable_index])( \
        (object_type *)(obj), (value))
// Fetch a method pointer from the vtable without calling it.
#define CPY_GET_METHOD(obj, type, vtable_index, object_type, method_type) \
    ((method_type)(((object_type *)(obj))->vtable[vtable_index]))
#define CPY_GET_METHOD_TRAIT(obj, trait, vtable_index, object_type, method_type) \
    ((method_type)(CPy_FindTraitVtable((trait), ((object_type *)(obj))->vtable)[vtable_index]))
// Int operations
//
// Out-of-line helpers for tagged ints and fixed-width native ints. The
// functions whose names end in an underscore (CPyTagged_Add_, CPyTagged_IsEq_,
// CPyLong_AsInt64_, ...) are the fallbacks that the static inline wrappers
// later in this header call when their fast path does not apply.

// Construction of tagged ints from C values.
CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value);
CPyTagged CPyTagged_FromVoidPtr(void *ptr);
CPyTagged CPyTagged_FromInt64(int64_t value);
// Conversion of tagged ints to objects / C values.
PyObject *CPyTagged_AsObject(CPyTagged x);
PyObject *CPyTagged_StealAsObject(CPyTagged x);
Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x);
// Reference counting; called by the CPyTagged_INCREF/DECREF/XDECREF inline
// wrappers only when the value is a long (boxed) int.
void CPyTagged_IncRef(CPyTagged x);
void CPyTagged_DecRef(CPyTagged x);
void CPyTagged_XDecRef(CPyTagged x);
// Fallbacks for the inline comparison and arithmetic wrappers.
bool CPyTagged_IsEq_(CPyTagged left, CPyTagged right);
bool CPyTagged_IsLt_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_Negate_(CPyTagged num);
CPyTagged CPyTagged_Invert_(CPyTagged num);
CPyTagged CPyTagged_Add_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_Subtract_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_Multiply_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_FloorDivide_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_Remainder_(CPyTagged left, CPyTagged right);
// Shared fallback for '&', '|' and '^'; the inline wrappers pass the operator
// character as `op`.
CPyTagged CPyTagged_BitwiseLongOp_(CPyTagged a, CPyTagged b, char op);
CPyTagged CPyTagged_Rshift_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right);
CPyTagged CPyTagged_BitLength(CPyTagged self);
PyObject *CPyTagged_Str(CPyTagged n);
CPyTagged CPyTagged_FromFloat(double f);
PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base);
PyObject *CPyLong_FromStr(PyObject *o);
PyObject *CPyBool_Str(bool b);
// Fixed-width integer helpers (int64 / int32 / int16 / uint8).
int64_t CPyLong_AsInt64_(PyObject *o);
int64_t CPyInt64_Divide(int64_t x, int64_t y);
int64_t CPyInt64_Remainder(int64_t x, int64_t y);
int32_t CPyLong_AsInt32_(PyObject *o);
int32_t CPyInt32_Divide(int32_t x, int32_t y);
int32_t CPyInt32_Remainder(int32_t x, int32_t y);
void CPyInt32_Overflow(void);
int16_t CPyLong_AsInt16_(PyObject *o);
int16_t CPyInt16_Divide(int16_t x, int16_t y);
int16_t CPyInt16_Remainder(int16_t x, int16_t y);
void CPyInt16_Overflow(void);
uint8_t CPyLong_AsUInt8_(PyObject *o);
void CPyUInt8_Overflow(void);
double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y);
// Nonzero when the tag bit of `x` is set, i.e. `x` is a long (pointer) tagged
// int rather than a short (unboxed) one.
static inline int CPyTagged_CheckLong(CPyTagged x) {
    int tag_bit = x & CPY_INT_TAG;
    return tag_bit;
}
// 1 when `x` is a short (unboxed) tagged int, 0 when it is a long one.
static inline int CPyTagged_CheckShort(CPyTagged x) {
    return CPyTagged_CheckLong(x) == 0;
}
// Take a reference on a tagged int. Short ints carry no reference count, so
// only long ints are forwarded to the out-of-line CPyTagged_IncRef().
static inline void CPyTagged_INCREF(CPyTagged x) {
    if (likely(CPyTagged_CheckShort(x))) {
        return;
    }
    CPyTagged_IncRef(x);
}
// Drop a reference on a tagged int. Short ints are ignored; long ints are
// forwarded to the out-of-line CPyTagged_DecRef().
static inline void CPyTagged_DECREF(CPyTagged x) {
    if (likely(CPyTagged_CheckShort(x))) {
        return;
    }
    CPyTagged_DecRef(x);
}
// Like CPyTagged_DECREF, but long ints are forwarded to CPyTagged_XDecRef().
static inline void CPyTagged_XDECREF(CPyTagged x) {
    if (likely(CPyTagged_CheckShort(x))) {
        return;
    }
    CPyTagged_XDecRef(x);
}
// Strip the tag bit from a short tagged int and return its value.
// NOTE: relies on the right shift of a signed value being sign-extending.
static inline Py_ssize_t CPyTagged_ShortAsSsize_t(CPyTagged x) {
    Py_ssize_t signed_bits = (Py_ssize_t)x;
    return signed_bits >> 1;
}
// Recover the object pointer stored in a long tagged int by clearing the tag
// bit. NOTE: the caller must ensure `x` is not a short int.
static inline PyObject *CPyTagged_LongAsObject(CPyTagged x) {
    CPyTagged untagged = x & ~CPY_INT_TAG;
    return (PyObject *)untagged;
}
// Convert an int object to a tagged int without consuming the caller's
// reference.
//
// If the value does not fit (CPyLong_AsSsize_tAndOverflow reports overflow),
// the result is the object pointer with the tag bit set, and a new reference
// is taken on `object` for it. Otherwise the value is stored unboxed, shifted
// left by one with the tag bit clear.
static inline CPyTagged CPyTagged_FromObject(PyObject *object) {
    int overflow;
    // The overflow check knows about CPyTagged's width
    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
    if (unlikely(overflow != 0)) {
        Py_INCREF(object);
        return ((CPyTagged)object) | CPY_INT_TAG;
    }
    // Shift in the unsigned domain: left-shifting a negative signed value is
    // undefined behavior in C, while the unsigned shift yields the same bits.
    return (CPyTagged)value << 1;
}
// Convert an int object to a tagged int, consuming the caller's reference.
//
// If the value does not fit, the reference is transferred into the returned
// long tagged int (object pointer with the tag bit set). Otherwise the
// reference is released and the value is returned unboxed.
static inline CPyTagged CPyTagged_StealFromObject(PyObject *object) {
    int overflow;
    // The overflow check knows about CPyTagged's width
    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
    if (unlikely(overflow != 0)) {
        return ((CPyTagged)object) | CPY_INT_TAG;
    }
    Py_DECREF(object);
    // Shift in the unsigned domain: left-shifting a negative signed value is
    // undefined behavior in C, while the unsigned shift yields the same bits.
    return (CPyTagged)value << 1;
}
// Convert an int object to a tagged int without touching reference counts.
//
// If the value does not fit, the result is the object pointer with the tag
// bit set and no new reference is taken. Otherwise the value is returned
// unboxed.
static inline CPyTagged CPyTagged_BorrowFromObject(PyObject *object) {
    int overflow;
    // The overflow check knows about CPyTagged's width
    Py_ssize_t value = CPyLong_AsSsize_tAndOverflow(object, &overflow);
    if (unlikely(overflow != 0)) {
        return ((CPyTagged)object) | CPY_INT_TAG;
    }
    // Shift in the unsigned domain: left-shifting a negative signed value is
    // undefined behavior in C, while the unsigned shift yields the same bits.
    return (CPyTagged)value << 1;
}
// True when `value` lies outside [CPY_TAGGED_MIN, CPY_TAGGED_MAX].
// The unsigned comparison comes first so that the common in-range,
// non-negative case is decided with a single test.
static inline bool CPyTagged_TooBig(Py_ssize_t value) {
    if ((size_t)value <= CPY_TAGGED_MAX) {
        return false;
    }
    return value >= 0 || value < CPY_TAGGED_MIN;
}
// int64_t flavour of CPyTagged_TooBig: true when `value` lies outside
// [CPY_TAGGED_MIN, CPY_TAGGED_MAX]. The unsigned comparison is tried first so
// the common in-range, non-negative case needs a single test.
static inline bool CPyTagged_TooBigInt64(int64_t value) {
    if ((uint64_t)value <= CPY_TAGGED_MAX) {
        return false;
    }
    return value >= 0 || value < CPY_TAGGED_MIN;
}
// Detect signed overflow of `sum = left + right`: the addition overflowed
// when the sign of the sum differs from the sign of both operands.
static inline bool CPyTagged_IsAddOverflow(CPyTagged sum, CPyTagged left, CPyTagged right) {
    Py_ssize_t differs_from_left = (Py_ssize_t)(sum ^ left);
    Py_ssize_t differs_from_right = (Py_ssize_t)(sum ^ right);
    return differs_from_left < 0 && differs_from_right < 0;
}
// Detect signed overflow of `diff = left - right`: the subtraction overflowed
// when the sign of the difference differs from `left` but matches `right`.
static inline bool CPyTagged_IsSubtractOverflow(CPyTagged diff, CPyTagged left, CPyTagged right) {
    Py_ssize_t differs_from_left = (Py_ssize_t)(diff ^ left);
    Py_ssize_t differs_from_right = (Py_ssize_t)(diff ^ right);
    return differs_from_left < 0 && differs_from_right >= 0;
}
// Conservative overflow pre-check for multiplying two short tagged ints.
// It reports "may overflow" unless both raw tagged values are below
// 2^(CPY_INT_BITS/2 - 1); because the operands are compared as unsigned,
// any negative operand also reports true.
static inline bool CPyTagged_IsMultiplyOverflow(CPyTagged left, CPyTagged right) {
    const CPyTagged limit = 1U << (CPY_INT_BITS/2 - 1);
    return left >= limit || right >= limit;
}
// True when the inline floor division must not be attempted: the divisor is
// zero, or the dividend is the value with only the top bit set.
static inline bool CPyTagged_MaybeFloorDivideFault(CPyTagged left, CPyTagged right) {
    if (right == 0) {
        return true;
    }
    return left == -((size_t)1 << (CPY_INT_BITS-1));
}
// True when the inline remainder must not be attempted.
//
// Only a zero divisor can fault here. The usual INT_MIN % -1 trap cannot
// occur because the operation is done on still-tagged values whose low bit is
// clear, so -1 is represented as -2.
static inline bool CPyTagged_MaybeRemainderFault(CPyTagged left, CPyTagged right) {
    (void)left;  // Unused: the dividend never causes a fault.
    return right == 0;
}
// Tagged int equality. When `left` is short a raw comparison is enough;
// otherwise the out-of-line CPyTagged_IsEq_() decides.
static inline bool CPyTagged_IsEq(CPyTagged left, CPyTagged right) {
    if (!CPyTagged_CheckShort(left)) {
        return CPyTagged_IsEq_(left, right);
    }
    return left == right;
}
// Tagged int inequality. When `left` is short a raw comparison is enough;
// otherwise the result is the negation of CPyTagged_IsEq_().
static inline bool CPyTagged_IsNe(CPyTagged left, CPyTagged right) {
    if (!CPyTagged_CheckShort(left)) {
        return !CPyTagged_IsEq_(left, right);
    }
    return left != right;
}
// left < right for tagged ints. Two short ints are compared directly as
// signed values; any other combination goes through CPyTagged_IsLt_().
static inline bool CPyTagged_IsLt(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (!both_short) {
        return CPyTagged_IsLt_(left, right);
    }
    return (Py_ssize_t)left < (Py_ssize_t)right;
}
// left >= right for tagged ints. Two short ints are compared directly as
// signed values; otherwise the answer is !CPyTagged_IsLt_(left, right).
static inline bool CPyTagged_IsGe(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (!both_short) {
        return !CPyTagged_IsLt_(left, right);
    }
    return (Py_ssize_t)left >= (Py_ssize_t)right;
}
// left > right for tagged ints. Two short ints are compared directly as
// signed values; otherwise the answer is CPyTagged_IsLt_(right, left).
static inline bool CPyTagged_IsGt(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (!both_short) {
        return CPyTagged_IsLt_(right, left);
    }
    return (Py_ssize_t)left > (Py_ssize_t)right;
}
// left <= right for tagged ints. Two short ints are compared directly as
// signed values; otherwise the answer is !CPyTagged_IsLt_(right, left).
static inline bool CPyTagged_IsLe(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (!both_short) {
        return !CPyTagged_IsLt_(right, left);
    }
    return (Py_ssize_t)left <= (Py_ssize_t)right;
}
// Convert a Python object to int64_t.
//
// Fast path: an int object holding zero or a single positive digit is
// converted inline. Everything else (non-int objects, negative or multi-digit
// ints) is handed to the out-of-line CPyLong_AsInt64_().
//
// The size check mirrors CPyLong_AsInt32/AsInt16/AsUInt8: with
// CPY_3_12_FEATURES the int's tag word is inspected via CPY_LONG_TAG instead
// of Py_SIZE(), since Py_SIZE() is not a valid way to read the size of an int
// object on those versions and the fast path would never match.
static inline int64_t CPyLong_AsInt64(PyObject *o) {
    if (likely(PyLong_Check(o))) {
        PyLongObject *lobj = (PyLongObject *)o;
#if CPY_3_12_FEATURES
        size_t tag = CPY_LONG_TAG(lobj);
        if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
            // Fast path: exactly one digit, positive
            return CPY_LONG_DIGIT(lobj, 0);
        } else if (likely(tag == CPY_SIGN_ZERO)) {
            return 0;
        }
#else
        Py_ssize_t size = Py_SIZE(lobj);
        if (likely(size == 1)) {
            // Fast path: exactly one digit, positive
            return CPY_LONG_DIGIT(lobj, 0);
        } else if (likely(size == 0)) {
            return 0;
        }
#endif
    }
    // Slow path
    return CPyLong_AsInt64_(o);
}
// Convert a Python object to int32_t.
//
// An int object holding zero or a single positive digit is converted inline;
// every other input is delegated to the out-of-line CPyLong_AsInt32_().
static inline int32_t CPyLong_AsInt32(PyObject *o) {
    if (!likely(PyLong_Check(o))) {
        return CPyLong_AsInt32_(o);
    }
    PyLongObject *lobj = (PyLongObject *)o;
#if CPY_3_12_FEATURES
    size_t tag = CPY_LONG_TAG(lobj);
    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
        // One positive digit
        return CPY_LONG_DIGIT(lobj, 0);
    }
    if (likely(tag == CPY_SIGN_ZERO)) {
        return 0;
    }
#else
    Py_ssize_t size = lobj->ob_base.ob_size;
    if (likely(size == 1)) {
        // One positive digit
        return CPY_LONG_DIGIT(lobj, 0);
    }
    if (likely(size == 0)) {
        return 0;
    }
#endif
    // Anything else takes the slow path
    return CPyLong_AsInt32_(o);
}
// Convert a Python object to int16_t.
//
// An int object holding zero, or a single positive digit below 0x8000, is
// converted inline; every other input (including single digits that are too
// large) is delegated to the out-of-line CPyLong_AsInt16_().
static inline int16_t CPyLong_AsInt16(PyObject *o) {
    if (!likely(PyLong_Check(o))) {
        return CPyLong_AsInt16_(o);
    }
    PyLongObject *lobj = (PyLongObject *)o;
#if CPY_3_12_FEATURES
    size_t tag = CPY_LONG_TAG(lobj);
    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
        // One positive digit: accept it only if it fits
        digit first = CPY_LONG_DIGIT(lobj, 0);
        if (first < 0x8000) {
            return first;
        }
    } else if (likely(tag == CPY_SIGN_ZERO)) {
        return 0;
    }
#else
    Py_ssize_t size = lobj->ob_base.ob_size;
    if (likely(size == 1)) {
        // One positive digit: accept it only if it fits
        digit first = lobj->ob_digit[0];
        if (first < 0x8000) {
            return first;
        }
    } else if (likely(size == 0)) {
        return 0;
    }
#endif
    // Anything else takes the slow path
    return CPyLong_AsInt16_(o);
}
// Convert a Python object to uint8_t.
//
// An int object holding zero, or a single positive digit below 256, is
// converted inline; every other input (including single digits that are too
// large) is delegated to the out-of-line CPyLong_AsUInt8_().
static inline uint8_t CPyLong_AsUInt8(PyObject *o) {
    if (!likely(PyLong_Check(o))) {
        return CPyLong_AsUInt8_(o);
    }
    PyLongObject *lobj = (PyLongObject *)o;
#if CPY_3_12_FEATURES
    size_t tag = CPY_LONG_TAG(lobj);
    if (likely(tag == (1 << CPY_NON_SIZE_BITS))) {
        // One positive digit: accept it only if it fits
        digit first = CPY_LONG_DIGIT(lobj, 0);
        if (first < 256) {
            return first;
        }
    } else if (likely(tag == CPY_SIGN_ZERO)) {
        return 0;
    }
#else
    Py_ssize_t size = lobj->ob_base.ob_size;
    if (likely(size == 1)) {
        // One positive digit: accept it only if it fits
        digit first = lobj->ob_digit[0];
        if (first < 256) {
            return first;
        }
    } else if (likely(size == 0)) {
        return 0;
    }
#endif
    // Anything else takes the slow path
    return CPyLong_AsUInt8_(o);
}
// Unary minus for tagged ints.
//
// A short int is negated inline unless it is the most negative representable
// value, which is the only short operand whose negation can overflow. That
// case, and all long ints, go through CPyTagged_Negate_().
static inline CPyTagged CPyTagged_Negate(CPyTagged num) {
    const CPyTagged most_negative = (CPyTagged) ((Py_ssize_t)1 << (CPY_INT_BITS - 1));
    if (likely(CPyTagged_CheckShort(num) && num != most_negative)) {
        return -num;
    }
    return CPyTagged_Negate_(num);
}
// Tagged int addition.
//
// Two short ints are added inline (the tag bits are both zero, so the raw sum
// is itself a valid tagged value) unless the sum overflows. Overflow and any
// long operand are handled by CPyTagged_Add_().
// TODO: Use a clang/gcc checked-add builtin instead.
static inline CPyTagged CPyTagged_Add(CPyTagged left, CPyTagged right) {
    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
        return CPyTagged_Add_(left, right);
    }
    CPyTagged total = left + right;
    if (unlikely(CPyTagged_IsAddOverflow(total, left, right))) {
        return CPyTagged_Add_(left, right);
    }
    return total;
}
// Tagged int subtraction.
//
// Two short ints are subtracted inline unless the difference overflows.
// Overflow and any long operand are handled by CPyTagged_Subtract_().
// TODO: Use a clang/gcc checked-subtract builtin instead.
static inline CPyTagged CPyTagged_Subtract(CPyTagged left, CPyTagged right) {
    if (unlikely(!(CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right)))) {
        return CPyTagged_Subtract_(left, right);
    }
    CPyTagged delta = left - right;
    if (unlikely(CPyTagged_IsSubtractOverflow(delta, left, right))) {
        return CPyTagged_Subtract_(left, right);
    }
    return delta;
}
// Tagged int multiplication.
//
// When both operands are short and the conservative overflow pre-check
// passes, the still-tagged left operand is multiplied by the untagged right
// operand, which yields a correctly tagged product. Everything else goes
// through CPyTagged_Multiply_().
// TODO: Consider using some clang/gcc extension to check for overflow
static inline CPyTagged CPyTagged_Multiply(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (both_short && !CPyTagged_IsMultiplyOverflow(left, right)) {
        return left * CPyTagged_ShortAsSsize_t(right);
    }
    return CPyTagged_Multiply_(left, right);
}
// Tagged int floor division (Python's `//`).
//
// Fast path: both operands are short and CPyTagged_MaybeFloorDivideFault()
// rules out a zero divisor and the most negative dividend. C division
// truncates toward zero, so when the operands have different signs and the
// division was inexact the quotient is decremented to round down instead.
// Everything else goes through CPyTagged_FloorDivide_().
static inline CPyTagged CPyTagged_FloorDivide(CPyTagged left, CPyTagged right) {
    if (CPyTagged_CheckShort(left)
        && CPyTagged_CheckShort(right)
        && !CPyTagged_MaybeFloorDivideFault(left, right)) {
        Py_ssize_t result = CPyTagged_ShortAsSsize_t(left) / CPyTagged_ShortAsSsize_t(right);
        if (((Py_ssize_t)left < 0) != (((Py_ssize_t)right) < 0)) {
            // `right` and `left` are still tagged (doubled), so the comparison
            // checks quotient * divisor == dividend with both sides doubled.
            if (result * right != left) {
                // Round down
                result--;
            }
        }
        // Re-tag in the unsigned domain: the quotient may be negative, and
        // left-shifting a negative signed value is undefined behavior in C.
        return (CPyTagged)result << 1;
    }
    return CPyTagged_FloorDivide_(left, right);
}
// Tagged int remainder (Python's `%`).
//
// Fast path: both operands are short and the divisor is non-zero. The C
// remainder is computed on the still-tagged values; when the operands have
// different signs and the remainder is non-zero, the divisor is added so the
// result takes the sign of the divisor. Everything else goes through
// CPyTagged_Remainder_().
static inline CPyTagged CPyTagged_Remainder(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (!both_short || CPyTagged_MaybeRemainderFault(left, right)) {
        return CPyTagged_Remainder_(left, right);
    }
    Py_ssize_t rem = (Py_ssize_t)left % (Py_ssize_t)right;
    bool signs_differ = ((Py_ssize_t)right < 0) != ((Py_ssize_t)left < 0);
    if (signs_differ && rem != 0) {
        rem += right;
    }
    return rem;
}
// Bitwise '~'
// A short int other than CPY_TAGGED_ABS_MIN is inverted inline, with the tag
// bit cleared again afterwards; other inputs go through CPyTagged_Invert_().
static inline CPyTagged CPyTagged_Invert(CPyTagged num) {
    bool inline_ok = CPyTagged_CheckShort(num) && num != CPY_TAGGED_ABS_MIN;
    if (likely(inline_ok)) {
        return ~num & ~CPY_INT_TAG;
    }
    return CPyTagged_Invert_(num);
}
// Bitwise '&'
// Two short ints are combined directly on their tagged representation; any
// long operand is delegated to CPyTagged_BitwiseLongOp_() with op '&'.
static inline CPyTagged CPyTagged_And(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (unlikely(!both_short)) {
        return CPyTagged_BitwiseLongOp_(left, right, '&');
    }
    return left & right;
}
// Bitwise '|'
// Two short ints are combined directly on their tagged representation; any
// long operand is delegated to CPyTagged_BitwiseLongOp_() with op '|'.
static inline CPyTagged CPyTagged_Or(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (unlikely(!both_short)) {
        return CPyTagged_BitwiseLongOp_(left, right, '|');
    }
    return left | right;
}
// Bitwise '^'
// Two short ints are combined directly on their tagged representation; any
// long operand is delegated to CPyTagged_BitwiseLongOp_() with op '^'.
static inline CPyTagged CPyTagged_Xor(CPyTagged left, CPyTagged right) {
    bool both_short = CPyTagged_CheckShort(left) && CPyTagged_CheckShort(right);
    if (unlikely(!both_short)) {
        return CPyTagged_BitwiseLongOp_(left, right, '^');
    }
    return left ^ right;
}
// Bitwise '>>'
// Fast path: both operands are short and the shift count is non-negative.
// A count of CPY_INT_BITS or more yields 0 for a non-negative left operand
// and -1 otherwise; smaller counts use an arithmetic shift followed by
// clearing the tag bit. Everything else goes through CPyTagged_Rshift_().
static inline CPyTagged CPyTagged_Rshift(CPyTagged left, CPyTagged right) {
    bool inline_ok = CPyTagged_CheckShort(left)
                     && CPyTagged_CheckShort(right)
                     && (Py_ssize_t)right >= 0;
    if (!likely(inline_ok)) {
        return CPyTagged_Rshift_(left, right);
    }
    CPyTagged amount = CPyTagged_ShortAsSsize_t(right);
    if (unlikely(amount >= CPY_INT_BITS)) {
        // Every value bit is shifted out; only the sign survives.
        if ((Py_ssize_t)left < 0) {
            return CPyTagged_ShortFromInt(-1);
        }
        return 0;
    }
    return ((Py_ssize_t)left >> amount) & ~CPY_INT_TAG;
}
// Report whether `short_int << shift` loses information, i.e. whether
// shifting left and then arithmetically shifting right by the same amount
// fails to give back the original value.
//
// The left shift is performed on the unsigned representation: left-shifting a
// negative signed value, or one whose result does not fit, is undefined
// behavior in C, and those are exactly the inputs this check must handle.
// The caller must pass 0 <= shift < the bit width of Py_ssize_t.
static inline bool IsShortLshiftOverflow(Py_ssize_t short_int, Py_ssize_t shift) {
    Py_ssize_t shifted = (Py_ssize_t)((size_t)short_int << shift);
    return (shifted >> shift) != short_int;
}
// Bitwise '<<'
// Fast path: both operands are short, the count is non-negative and smaller
// than CPY_INT_BITS (the tagged count is compared against CPY_INT_BITS * 2),
// and the shift does not overflow. Everything else goes through
// CPyTagged_Lshift_().
static inline CPyTagged CPyTagged_Lshift(CPyTagged left, CPyTagged right) {
    bool inline_ok = CPyTagged_CheckShort(left)
                     && CPyTagged_CheckShort(right)
                     && (Py_ssize_t)right >= 0
                     && right < CPY_INT_BITS * 2;
    if (likely(inline_ok)) {
        CPyTagged amount = CPyTagged_ShortAsSsize_t(right);
        if (!IsShortLshiftOverflow(left, amount)) {
            // Short integers, no overflow
            return left << amount;
        }
    }
    return CPyTagged_Lshift_(left, right);
}
// Float operations
//
// Out-of-line helpers operating on C doubles; their definitions are not part
// of this header.
double CPyFloat_FloorDivide(double x, double y);
double CPyFloat_Pow(double x, double y);
double CPyFloat_Sin(double x);
double CPyFloat_Cos(double x);
double CPyFloat_Tan(double x);
double CPyFloat_Sqrt(double x);
double CPyFloat_Exp(double x);
double CPyFloat_Log(double x);
// Floor/ceil produce a tagged int rather than a double.
CPyTagged CPyFloat_Floor(double x);
CPyTagged CPyFloat_Ceil(double x);
double CPyFloat_FromTagged(CPyTagged x);
bool CPyFloat_IsInf(double x);
bool CPyFloat_IsNaN(double x);
// Generic operations (that work with arbitrary types)
/* We use intentionally non-inlined decrefs in rarely executed code
 * paths since it pretty substantially speeds up compile time. We have
 * our own copies both to avoid the null check in Py_DecRef and to avoid
 * making an indirect PIC call. */
// Non-inlined wrapper around CPy_DECREF; `p` is passed straight to the macro
// with no NULL check here.
CPy_NOINLINE
static void CPy_DecRef(PyObject *p) {
CPy_DECREF(p);
}
// Non-inlined wrapper around CPy_XDECREF, for rarely executed code paths.
CPy_NOINLINE
static void CPy_XDecRef(PyObject *p) {
CPy_XDECREF(p);
}
// Return the size of `obj` (via PyObject_Size) as a tagged int.
// A negative size from PyObject_Size is reported by returning CPY_INT_TAG.
static inline CPyTagged CPyObject_Size(PyObject *obj) {
    Py_ssize_t length = PyObject_Size(obj);
    if (length >= 0) {
        // Technically __len__ could return a really big number, so the
        // general conversion is used and may produce a boxed int. In practice
        // that should not happen if the container really holds that many
        // elements, but...
        return CPyTagged_FromSsize_t(length);
    }
    return CPY_INT_TAG;
}
#ifdef MYPYC_LOG_GETATTR
// Debug hook, compiled in only when MYPYC_LOG_GETATTR is defined.
//
// Imports the module "getattr_hook" and calls its function named `method`
// with (obj, attr). The call result is discarded, and any exception raised by
// the import or the call is cleared so that logging never affects the caller.
//
// `method` is a C string, so the call goes through PyObject_CallMethod(),
// which takes the method name as `const char *`. PyObject_CallMethodObjArgs()
// expects the name as a PyObject * and must not be given a C string.
static void CPy_LogGetAttr(const char *method, PyObject *obj, PyObject *attr) {
    PyObject *module = PyImport_ImportModule("getattr_hook");
    if (module) {
        PyObject *res = PyObject_CallMethod(module, method, "OO", obj, attr);
        Py_XDECREF(res);
        Py_DECREF(module);
    }
    PyErr_Clear();
}
#else
// Logging disabled: the hook expands to a no-op expression.
#define CPy_LogGetAttr(method, obj, attr) (void)0
#endif
// Intercept a method call and log it. This needs to be a macro
// because there is no API that accepts va_args for making a
// call. Worse, it needs to use the comma operator to return the right
// value.
// The log hook runs first (with the "log_method" hook name); the value of the
// whole expression is the result of PyObject_CallMethodObjArgs. Note that
// `obj` and `attr` appear twice in the expansion.
#define CPyObject_CallMethodObjArgs(obj, attr, ...) \
(CPy_LogGetAttr("log_method", (obj), (attr)), \
PyObject_CallMethodObjArgs((obj), (attr), __VA_ARGS__))
// This one is a macro for consistency with the above, I guess.
// It logs with the "log" hook name and then evaluates to
// PyObject_GetAttr(obj, attr).
#define CPyObject_GetAttr(obj, attr) \
(CPy_LogGetAttr("log", (obj), (attr)), \
PyObject_GetAttr((obj), (attr)))
// Out-of-line generic object helpers; their definitions are not part of this
// header.
CPyTagged CPyObject_Hash(PyObject *o);
PyObject *CPyObject_GetAttr3(PyObject *v, PyObject *name, PyObject *defl);
PyObject *CPyIter_Next(PyObject *iter);
PyObject *CPyNumber_Power(PyObject *base, PyObject *index);
PyObject *CPyNumber_InPlacePower(PyObject *base, PyObject *index);
// Slice bounds are passed as tagged ints.
PyObject *CPyObject_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
// List operations
//
// Out-of-line list and sequence helpers; their definitions are not part of
// this header. Indexes are tagged ints unless the name says otherwise
// (Int64 variants take int64_t, "Unsafe" takes a raw Py_ssize_t).
PyObject *CPyList_Build(Py_ssize_t len, ...);
PyObject *CPyList_GetItem(PyObject *list, CPyTagged index);
PyObject *CPyList_GetItemShort(PyObject *list, CPyTagged index);
PyObject *CPyList_GetItemBorrow(PyObject *list, CPyTagged index);
PyObject *CPyList_GetItemShortBorrow(PyObject *list, CPyTagged index);
PyObject *CPyList_GetItemInt64(PyObject *list, int64_t index);
PyObject *CPyList_GetItemInt64Borrow(PyObject *list, int64_t index);
bool CPyList_SetItem(PyObject *list, CPyTagged index, PyObject *value);
void CPyList_SetItemUnsafe(PyObject *list, Py_ssize_t index, PyObject *value);
bool CPyList_SetItemInt64(PyObject *list, int64_t index, PyObject *value);
PyObject *CPyList_PopLast(PyObject *obj);
PyObject *CPyList_Pop(PyObject *obj, CPyTagged index);
CPyTagged CPyList_Count(PyObject *obj, PyObject *value);
int CPyList_Insert(PyObject *list, CPyTagged index, PyObject *value);
PyObject *CPyList_Extend(PyObject *o1, PyObject *o2);
int CPyList_Remove(PyObject *list, PyObject *obj);
CPyTagged CPyList_Index(PyObject *list, PyObject *obj);
PyObject *CPySequence_Sort(PyObject *seq);
PyObject *CPySequence_Multiply(PyObject *seq, CPyTagged t_size);
PyObject *CPySequence_RMultiply(CPyTagged t_size, PyObject *seq);
PyObject *CPySequence_InPlaceMultiply(PyObject *seq, CPyTagged t_size);
PyObject *CPyList_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
char CPyList_Clear(PyObject *list);
PyObject *CPyList_Copy(PyObject *list);
int CPySequence_Check(PyObject *obj);
// Dict operations
//
// Out-of-line dict and mapping helpers; their definitions are not part of
// this header.
PyObject *CPyDict_GetItem(PyObject *dict, PyObject *key);
int CPyDict_SetItem(PyObject *dict, PyObject *key, PyObject *value);
PyObject *CPyDict_Get(PyObject *dict, PyObject *key, PyObject *fallback);
PyObject *CPyDict_GetWithNone(PyObject *dict, PyObject *key);
PyObject *CPyDict_SetDefault(PyObject *dict, PyObject *key, PyObject *value);
PyObject *CPyDict_SetDefaultWithNone(PyObject *dict, PyObject *key);
PyObject *CPyDict_SetDefaultWithEmptyDatatype(PyObject *dict, PyObject *key, int data_type);
PyObject *CPyDict_Build(Py_ssize_t size, ...);
int CPyDict_Update(PyObject *dict, PyObject *stuff);
int CPyDict_UpdateInDisplay(PyObject *dict, PyObject *stuff);
int CPyDict_UpdateFromAny(PyObject *dict, PyObject *stuff);
PyObject *CPyDict_FromAny(PyObject *obj);
PyObject *CPyDict_KeysView(PyObject *dict);
PyObject *CPyDict_ValuesView(PyObject *dict);
PyObject *CPyDict_ItemsView(PyObject *dict);
PyObject *CPyDict_Keys(PyObject *dict);
PyObject *CPyDict_Values(PyObject *dict);
PyObject *CPyDict_Items(PyObject *dict);
char CPyDict_Clear(PyObject *dict);
PyObject *CPyDict_Copy(PyObject *dict);
PyObject *CPyDict_GetKeysIter(PyObject *dict);
PyObject *CPyDict_GetItemsIter(PyObject *dict);
PyObject *CPyDict_GetValuesIter(PyObject *dict);
// Iteration steppers: each takes the current offset as a tagged int and
// returns one of the tuple_T3CIO / tuple_T4CIOO structs by value (a
// "should continue" flag, the last dict offset, and the next key and/or
// value).
tuple_T3CIO CPyDict_NextKey(PyObject *dict_or_iter, CPyTagged offset);
tuple_T3CIO CPyDict_NextValue(PyObject *dict_or_iter, CPyTagged offset);
tuple_T4CIOO CPyDict_NextItem(PyObject *dict_or_iter, CPyTagged offset);
int CPyMapping_Check(PyObject *obj);
// Verify that `dict` still has `size` entries, i.e. that it did not change
// size during iteration.
//
// Returns 1 when the check passes, or when `dict` is not an exact dict
// (subclasses are left to the Python runtime to check). Returns 0 with a
// RuntimeError set when the sizes differ.
static inline char CPyDict_CheckSize(PyObject *dict, Py_ssize_t size) {
    if (PyDict_CheckExact(dict) && PyDict_Size(dict) != size) {
        PyErr_SetString(PyExc_RuntimeError, "dictionary changed size during iteration");
        return 0;
    }
    return 1;
}
// Str operations
// Macros for strip type. These values are copied from CPython.
// They are passed as `strip_type` to _CPyStr_Strip() by the CPyStr_Strip,
// CPyStr_LStrip and CPyStr_RStrip wrappers in this header.
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2
// Out-of-line string helpers; their definitions are not part of this header.
char CPyStr_Equal(PyObject *str1, PyObject *str2);
char CPyStr_EqualLiteral(PyObject *str, PyObject *literal_str, Py_ssize_t literal_length);
PyObject *CPyStr_Build(Py_ssize_t len, ...);
PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index);
PyObject *CPyStr_GetItemUnsafe(PyObject *str, Py_ssize_t index);
CPyTagged CPyStr_Find(PyObject *str, PyObject *substr, CPyTagged start, int direction);
CPyTagged CPyStr_FindWithEnd(PyObject *str, PyObject *substr, CPyTagged start, CPyTagged end, int direction);
PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split);
PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split);
// Shared implementation behind the three strip wrappers; `strip_type` is one
// of LEFTSTRIP, RIGHTSTRIP or BOTHSTRIP.
PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep);
// Strip at both ends: forwards to _CPyStr_Strip() with BOTHSTRIP.
static inline PyObject *CPyStr_Strip(PyObject *self, PyObject *sep) {
    PyObject *stripped = _CPyStr_Strip(self, BOTHSTRIP, sep);
    return stripped;
}
// Strip at the left end only: forwards to _CPyStr_Strip() with LEFTSTRIP.
static inline PyObject *CPyStr_LStrip(PyObject *self, PyObject *sep) {
    PyObject *stripped = _CPyStr_Strip(self, LEFTSTRIP, sep);
    return stripped;
}
// Strip at the right end only: forwards to _CPyStr_Strip() with RIGHTSTRIP.
static inline PyObject *CPyStr_RStrip(PyObject *self, PyObject *sep) {
    PyObject *stripped = _CPyStr_Strip(self, RIGHTSTRIP, sep);
    return stripped;
}
// Further out-of-line string, decode and encode helpers; their definitions
// are not part of this header.
PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr, PyObject *new_substr, CPyTagged max_replace);
PyObject *CPyStr_Append(PyObject *o1, PyObject *o2);
PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
int CPyStr_Startswith(PyObject *self, PyObject *subobj);
int CPyStr_Endswith(PyObject *self, PyObject *subobj);
PyObject *CPyStr_Removeprefix(PyObject *self, PyObject *prefix);
PyObject *CPyStr_Removesuffix(PyObject *self, PyObject *suffix);
bool CPyStr_IsTrue(PyObject *obj);
Py_ssize_t CPyStr_Size_size_t(PyObject *str);
PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors);
PyObject *CPy_DecodeUTF8(PyObject *bytes);
PyObject *CPy_DecodeASCII(PyObject *bytes);
PyObject *CPy_DecodeLatin1(PyObject *bytes);
PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors);
Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start);
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end);
CPyTagged CPyStr_Ord(PyObject *obj);
// Bytes operations
// (out-of-line helpers; definitions are not part of this header)
PyObject *CPyBytes_Build(Py_ssize_t len, ...);
PyObject *CPyBytes_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index);
PyObject *CPyBytes_Concat(PyObject *a, PyObject *b);
PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter);
CPyTagged CPyBytes_Ord(PyObject *obj);
int CPyBytes_Compare(PyObject *left, PyObject *right);
// Set operations
bool CPySet_Remove(PyObject *set, PyObject *key);
// Tuple operations
// The "Unsafe" variants take a raw Py_ssize_t index instead of a tagged int.
PyObject *CPySequenceTuple_GetItem(PyObject *tuple, CPyTagged index);
PyObject *CPySequenceTuple_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end);
PyObject *CPySequenceTuple_GetItemUnsafe(PyObject *tuple, Py_ssize_t index);
void CPySequenceTuple_SetItemUnsafe(PyObject *tuple, Py_ssize_t index, PyObject *value);
// Exception operations
// mypyc is not very good at dealing with refcount management of
// pointers that might be NULL. As a workaround for this, the
// exception APIs that might want to return NULL pointers instead
// return properly refcounted pointers to this dummy object.
// _CPy_ToDummy() and _CPy_FromDummy() convert between NULL and the dummy.
// The struct contains only the standard object header.
struct ExcDummyStruct { PyObject_HEAD };
extern struct ExcDummyStruct _CPy_ExcDummyStruct;
extern PyObject *_CPy_ExcDummy;
// Replace a NULL pointer stored in `*p` with a new reference to the exception
// dummy object. Non-NULL values are left untouched.
static inline void _CPy_ToDummy(PyObject **p) {
    if (*p != NULL) {
        return;
    }
    Py_INCREF(_CPy_ExcDummy);
    *p = _CPy_ExcDummy;
}
// Inverse of _CPy_ToDummy: map the exception dummy object back to NULL.
// Any other object is returned with a new reference taken on it.
static inline PyObject *_CPy_FromDummy(PyObject *p) {
    if (p != _CPy_ExcDummy) {
        Py_INCREF(p);
        return p;
    }
    return NULL;
}
static int CPy_NoErrOccurred(void) {
return PyErr_Occurred() == NULL;
}
// Always returns false.
static inline bool CPy_KeepPropagating(void) {
    return false;
}
// We want to avoid the public PyErr_GetExcInfo API for these because
// it requires a bunch of spurious refcount traffic on the parts of
// the triple we don't care about.
// CPy_ExcState() expands to the exc_info member of the current thread state.
#define CPy_ExcState() PyThreadState_GET()->exc_info
// Out-of-line exception helpers; their definitions are not part of this
// header. The tuple_T3OOO values carry three object pointers by value.
void CPy_Raise(PyObject *exc);
void CPy_Reraise(void);
void CPyErr_SetObjectAndTraceback(PyObject *type, PyObject *value, PyObject *traceback);
tuple_T3OOO CPy_CatchError(void);
void CPy_RestoreExcInfo(tuple_T3OOO info);
bool CPy_ExceptionMatches(PyObject *type);
PyObject *CPy_GetExcValue(void);
tuple_T3OOO CPy_GetExcInfo(void);
void _CPy_GetExcInfo(PyObject **p_type, PyObject **p_value, PyObject **p_traceback);
void CPyError_OutOfMemory(void);
void CPy_TypeError(const char *expected, PyObject *value);
// Traceback helpers take the source location (filename, function name, line)
// and the module globals.
void CPy_AddTraceback(const char *filename, const char *funcname, int line, PyObject *globals);
void CPy_TypeErrorTraceback(const char *filename, const char *funcname, int line,
PyObject *globals, const char *expected, PyObject *value);
void CPy_AttributeError(const char *filename, const char *funcname, const char *classname,
const char *attrname, int line, PyObject *globals);
// Misc operations
// Thin aliases for CPython's trashcan macros. CPy_TRASHCAN_END accepts an
// `op` argument for symmetry but does not use it.
#define CPy_TRASHCAN_BEGIN(op, dealloc) Py_TRASHCAN_BEGIN(op, dealloc)
#define CPy_TRASHCAN_END(op) Py_TRASHCAN_END
// Tweaked version of _PyArg_Parser in CPython
// Passed by pointer to the CPyArg_ParseStackAndKeywords* functions declared
// later in this header. The `next` field allows parsers to be chained into a
// singly linked list.
typedef struct CPyArg_Parser {
const char *format;
const char * const *keywords;
const char *fname;
const char *custom_msg;
int pos; /* number of positional-only arguments */
int min; /* minimal number of arguments */
int max; /* maximal number of positional arguments */
int has_required_kws; /* are there any keyword-only arguments? */
int required_kwonly_start;
int varargs; /* does the function accept *args or **kwargs? */
PyObject *kwtuple; /* tuple of keyword parameter names */
struct CPyArg_Parser *next;
} CPyArg_Parser;
// Runtime check for values typed as float. mypy lets ints silently coerce to
// floats, so an int object is accepted as well as a float object.
static inline bool CPyFloat_Check(PyObject *o) {
    if (PyFloat_Check(o)) {
        return true;
    }
    return PyLong_Check(o);
}
// TODO: find a unified way to avoid inline functions in non-C back ends that
// can not use inline functions
// Wrapper around PyObject_TypeCheck that accepts the type as a PyObject *.
static inline bool CPy_TypeCheck(PyObject *o, PyObject *type) {
    PyTypeObject *expected = (PyTypeObject *)type;
    return PyObject_TypeCheck(o, expected);
}
// Return the type of `obj` as a new reference.
static inline PyObject *CPy_TYPE(PyObject *obj) {
    PyObject *type_obj = (PyObject *)Py_TYPE(obj);
    Py_INCREF(type_obj);
    return type_obj;
}
// Out-of-line miscellaneous helpers; their definitions are not part of this
// header.
PyObject *CPy_CalculateMetaclass(PyObject *type, PyObject *o);
PyObject *CPy_GetCoro(PyObject *obj);
PyObject *CPyIter_Send(PyObject *iter, PyObject *val);
int CPy_YieldFromErrorHandle(PyObject *iter, PyObject **outp);
PyObject *CPy_FetchStopIterationValue(void);
PyObject *CPyType_FromTemplate(PyObject *template_,
PyObject *orig_bases,
PyObject *modname);
PyObject *CPyType_FromTemplateWrapper(PyObject *template_,
PyObject *orig_bases,
PyObject *modname);
int CPyDataclass_SleightOfHand(PyObject *dataclass_dec, PyObject *tp,
PyObject *dict, PyObject *annotations,
PyObject *dataclass_type);
PyObject *CPyPickle_SetState(PyObject *obj, PyObject *state);
PyObject *CPyPickle_GetState(PyObject *obj);
CPyTagged CPyTagged_Id(PyObject *o);
void CPyDebug_Print(const char *msg);
void CPyDebug_PrintObject(PyObject *obj);
void CPy_Init(void);
// Argument parsing entry points. The ParseStackAndKeywords* family takes an
// argument array with its count plus a kwnames object, and a CPyArg_Parser
// describing the signature.
int CPyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
const char *, const char *, const char * const *, ...);
int CPyArg_ParseStackAndKeywords(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
CPyArg_Parser *parser, ...);
int CPyArg_ParseStackAndKeywordsNoArgs(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
CPyArg_Parser *parser, ...);
int CPyArg_ParseStackAndKeywordsOneArg(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
CPyArg_Parser *parser, ...);
int CPyArg_ParseStackAndKeywordsSimple(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
CPyArg_Parser *parser, ...);
int CPySequence_CheckUnpackCount(PyObject *sequence, Py_ssize_t expected);
// Fills `statics` from the given tables of literal data.
int CPyStatics_Initialize(PyObject **statics,
const char * const *strings,
const char * const *bytestrings,
const char * const *ints,
const double *floats,
const double *complex_numbers,
const int *tuples,
const int *frozensets);
PyObject *CPy_Super(PyObject *builtins, PyObject *self);
PyObject *CPy_CallReverseOpMethod(PyObject *left, PyObject *right, const char *op,
_Py_Identifier *method);
bool CPyImport_ImportMany(PyObject *modules, CPyModule **statics[], PyObject *globals,
PyObject *tb_path, PyObject *tb_function, Py_ssize_t *tb_lines);
PyObject *CPyImport_ImportFromMany(PyObject *mod_id, PyObject *names, PyObject *as_names,
PyObject *globals);
PyObject *CPySingledispatch_RegisterFunction(PyObject *singledispatch_func, PyObject *cls,
PyObject *func);
PyObject *CPy_GetAIter(PyObject *obj);
PyObject *CPy_GetANext(PyObject *aiter);
void CPy_SetTypeAliasTypeComputeFunction(PyObject *alias, PyObject *compute_value);
void CPyTrace_LogEvent(const char *location, const char *line, const char *op, const char *details);
// Generic attribute lookup: forwards to _PyObject_GenericGetAttrWithDict with
// no explicit dict (NULL) and 1 as the final argument.
static inline PyObject *CPyObject_GenericGetAttr(PyObject *self, PyObject *name) {
    PyObject *attr_value = _PyObject_GenericGetAttrWithDict(self, name, NULL, 1);
    return attr_value;
}
// Generic attribute store of `value` under `name` on `self`. Forwards to
// CPython's private _PyObject_GenericSetAttrWithDict with no explicit dict
// (NULL) and returns that helper's status code unchanged.
static inline int CPyObject_GenericSetAttr(PyObject *self, PyObject *name, PyObject *value)
{
    const int status = _PyObject_GenericSetAttrWithDict(self, name, value, NULL);
    return status;
}
PyObject *CPy_SetupObject(PyObject *type);
#if CPY_3_11_FEATURES
PyObject *CPy_GetName(PyObject *obj);
#endif
#if CPY_3_14_FEATURES
void CPy_SetImmortal(PyObject *obj);
#endif
#ifdef __cplusplus
}
#endif
#endif // CPY_CPY_H

View File

@@ -0,0 +1,68 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX_USE_ASM 1
# else
# define BASE64_AVX_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_AVX_USE_ASM
# include "./enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_AVX
// AVX entry point for streaming base64 encoding. BASE64_ENC_PARAMS is a macro
// that expands to the parameter list; the stub call below shows that it
// provides state, src, srclen, out and outlen.
// When HAVE_AVX is false at compile time, the call is forwarded unchanged to
// base64_enc_stub. Otherwise the body is assembled from textually included
// fragments wrapped around a single bulk encoder call.
// NOTE(review): enc_head.c and enc_tail.c are not visible from here;
// presumably they declare s, slen, o and olen and deal with the input that the
// bulk loop leaves unprocessed -- confirm against ../generic.
void
base64_stream_encode_avx BASE64_ENC_PARAMS
{
#if HAVE_AVX
#include "../generic/enc_head.c"
// For supported compilers, use a hand-optimized inline assembly
// encoder. Otherwise fall back on the SSSE3 encoder, but compiled with
// AVX flags to generate better optimized AVX code.
#if BASE64_AVX_USE_ASM
enc_loop_avx(&s, &slen, &o, &olen);
#else
enc_loop_ssse3(&s, &slen, &o, &olen);
#endif
#include "../generic/enc_tail.c"
#else
// AVX support was not compiled in: delegate to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// AVX entry point for streaming base64 decoding. BASE64_DEC_PARAMS is a macro
// that expands to the parameter list; the stub call below shows that it
// provides state, src, srclen, out and outlen.
// When HAVE_AVX is false at compile time, the result of base64_dec_stub is
// returned. Otherwise the SSSE3 bulk decoder (compiled in this AVX translation
// unit) runs between two textually included fragments.
// NOTE(review): no return statement is visible on the HAVE_AVX path, so the
// return value presumably comes from dec_tail.c; likewise s, slen, o and olen
// are presumably declared by dec_head.c -- confirm against ../generic.
int
base64_stream_decode_avx BASE64_DEC_PARAMS
{
#if HAVE_AVX
#include "../generic/dec_head.c"
dec_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// AVX support was not compiled in: delegate to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,264 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory with
// `vlddqu`. R0 must be a string literal naming an asm operand (e.g. "a"),
// because it is pasted between `%[` and `]`. ROUND is stringified into the
// displacement, so round N reads from src + N * 12: consecutive rounds are 12
// input bytes apart.
#define LOAD(R0, ROUND) \
"vlddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles the
// input register R0 using the preloaded constants lut0 and msk0..msk3.
// In the AT&T operand order used here the destination is the last operand, so
// R0 is only read, the result ends up in R1, and R2 is overwritten as a
// temporary. All three arguments must be string literals naming asm operands.
#define SHUF(R0, R1, R2) \
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
"vpor %["R1"], %["R2"], %["R1"] \n\t"
// Generate a block of inline assembly that translates the contents of R1 to
// the base64 alphabet, using the preloaded constants n51, n25 and lut1.
// In the AT&T operand order used here the destination is the last operand, so
// R1 is only read, the translated result ends up in R0, and R2 is overwritten
// as a temporary. All three arguments must be string literals naming asm
// operands.
#define TRAN(R0, R1, R2) \
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 with
// `vmovdqu`. R0 must be a string literal naming an asm operand. ROUND is
// stringified into the displacement, so round N writes to dst + N * 16:
// consecutive rounds are 16 output bytes apart.
#define STOR(R0, ROUND) \
"vmovdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
// The data is loaded into "a", shuffled into "b" (with "c" as scratch),
// translated back into "a" (again with "c" as scratch) and stored from "a".
// The pointers then advance by 12 input bytes and 16 output bytes, which is
// why the load and store can both use round index 0.
#define ROUND() \
LOAD("a", 0) \
SHUF("a", "b", "c") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $12, %[src] \n\t" \
"add $16, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// On exit, a holds the fully shuffled and translated data of round 0 (ready
// to be stored), while b and c hold the raw, still unprocessed input of rounds
// 1 and 2. Registers d and e are used as intermediates along the way.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Each row has one
// symbol per register column. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
// . indicates that a register is not involved in that step.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0) /* + . . . . . */ \
SHUF("a", "d", "e") /* | . . + x . */ \
LOAD("b", 1) /* | + . | . . */ \
TRAN("a", "d", "e") /* | | . - x . */ \
LOAD("c", 2) /* V V V . . . */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
// On entry, A is already translated while B and C hold raw input. On exit, D
// is already translated while E and F hold raw input, which is the same state
// shifted by three rounds. The macro stores rounds ROUND+0..ROUND+2 and loads
// rounds ROUND+3..ROUND+5.
// In the register graph each row has one symbol per register column:
// + loaded, | in use, - decommissioned, x temporary, V macro input/output,
// . not involved in that step.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3)) /* V V V + . . */ \
SHUF(B, E, F) /* | | | | + x */ \
STOR(A, (ROUND + 0)) /* - | | | | . */ \
TRAN(B, E, F) /* . | | | - x */ \
LOAD(E, (ROUND + 4)) /* . | | | + . */ \
SHUF(C, A, F) /* + | | | | x */ \
STOR(B, (ROUND + 1)) /* | - | | | . */ \
TRAN(C, A, F) /* - . | | | x */ \
LOAD(F, (ROUND + 5)) /* . . | | | + */ \
SHUF(D, A, B) /* + x | | | | */ \
STOR(C, (ROUND + 2)) /* | . - | | | */ \
TRAN(D, A, B) /* - x . V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
// On entry, D is already translated while E and F hold raw input. The macro
// stores rounds ROUND+3..ROUND+5 and issues no further loads; A, B and C are
// only used as intermediates.
// In the register graph each row has one symbol per register column:
// + loaded, | in use, - decommissioned, x temporary, V macro input/output,
// . not involved in that step.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A, B) /* + x . V V V */ \
STOR(D, (ROUND + 3)) /* | . . - | | */ \
TRAN(E, A, B) /* - x . . | | */ \
SHUF(F, C, D) /* . . + x | | */ \
STOR(E, (ROUND + 4)) /* . . | . - | */ \
TRAN(F, C, D) /* . . - x . | */ \
STOR(F, (ROUND + 5)) /* . . . . . - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A,
// so A and B rounds can be alternated to form a chain.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round. Because the regular round stores
// ROUND+0..ROUND+2 and the end macro stores ROUND+3..ROUND+5, a *_LAST macro
// completes six rounds in total.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round. Like the type A variant, it completes
// rounds ROUND+0..ROUND+5.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Bulk AVX base64 encoder, written as one inline assembly statement.
//
// s, o:  in/out pointers to the read and write cursors. They are bound to the
//        asm as read-write operands and are advanced by the asm itself.
// slen:  in/out count of input bytes; reduced by 12 per round.
// olen:  in/out count of output bytes; increased by 16 per round.
//
// Nothing is done when fewer than 16 input bytes are available. Otherwise the
// number of rounds is split into `loops` passes through a 36x unrolled block
// and a remainder below 36, which is worked off by at most one 18x, one 9x and
// one 6x block followed by zero to five single rounds. Because the remainder
// is always below 36, the signed jumps (jl, jg) used for dispatching are safe
// even though `rounds` is a size_t.
static inline void
enc_loop_avx (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 16) {
return;
}
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
// bytes, so "reserve" four bytes from the input buffer to ensure that
// we never read beyond the end of the input buffer.
size_t rounds = (*slen - 4) / 12;
*slen -= rounds * 12; // 12 bytes consumed per round
*olen += rounds * 16; // 16 bytes produced per round
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop.
rounds %= 36;
// Lookup tables.
const __m128i lut0 = _mm_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
const __m128i lut1 = _mm_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers. They are deliberately left uninitialized: the
// asm declares them as write-only outputs and loads them itself.
__m128i a, b, c, d, e, f;
__asm__ volatile (
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
" add $(12 * 36), %[src] \n\t"
" add $(16 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(12 * 18), %[src] \n\t"
" add $(16 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(12 * 9), %[src] \n\t"
" add $(16 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(12 * 6), %[src] \n\t"
" add $(16 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified). The vector temporaries use the early-clobber
// modifier (&) so that they are never assigned the same register as
// one of the constant inputs below.
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "=&x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm_set1_epi32(0x04000040)),
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm_set1_epi32(0x01000010)),
[n51] "x" (_mm_set1_epi8(51)),
[n25] "x" (_mm_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@@ -0,0 +1,58 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX2
#include <immintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_AVX2_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_AVX2_USE_ASM 1
# else
# define BASE64_AVX2_USE_ASM 0
# endif
#endif
#include "./dec_reshuffle.c"
#include "./dec_loop.c"
#if BASE64_AVX2_USE_ASM
# include "./enc_loop_asm.c"
#else
# include "./enc_translate.c"
# include "./enc_reshuffle.c"
# include "./enc_loop.c"
#endif
#endif // HAVE_AVX2
// AVX2 entry point for streaming base64 encoding. BASE64_ENC_PARAMS is a macro
// that expands to the parameter list; the stub call below shows that it
// provides state, src, srclen, out and outlen.
// When HAVE_AVX2 is false at compile time, the call is forwarded unchanged to
// base64_enc_stub. Otherwise the AVX2 bulk encoder runs between two textually
// included fragments.
// NOTE(review): enc_head.c and enc_tail.c are not visible from here;
// presumably they declare s, slen, o and olen and deal with the input that the
// bulk loop leaves unprocessed -- confirm against ../generic.
void
base64_stream_encode_avx2 BASE64_ENC_PARAMS
{
#if HAVE_AVX2
#include "../generic/enc_head.c"
enc_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
// AVX2 support was not compiled in: delegate to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// AVX2 entry point for streaming base64 decoding. BASE64_DEC_PARAMS is a macro
// that expands to the parameter list; the stub call below shows that it
// provides state, src, srclen, out and outlen.
// When HAVE_AVX2 is false at compile time, the result of base64_dec_stub is
// returned. Otherwise the AVX2 bulk decoder runs between two textually
// included fragments.
// NOTE(review): no return statement is visible on the HAVE_AVX2 path, so the
// return value presumably comes from dec_tail.c; likewise s, slen, o and olen
// are presumably declared by dec_head.c -- confirm against ../generic.
int
base64_stream_decode_avx2 BASE64_DEC_PARAMS
{
#if HAVE_AVX2
#include "../generic/dec_head.c"
dec_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// AVX2 support was not compiled in: delegate to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,110 @@
// Decode a single 32-byte block of base64 text at *s into 24 bytes at *o.
//
// Returns 1 on success, after advancing *s by 32 and *o by 24 and decrementing
// *rounds. Returns 0 without touching *s, *o or *rounds when the block does
// not pass the table-based validity check. Note that a full 32-byte vector is
// stored on success, even though the output cursor only moves by 24.
static BASE64_FORCE_INLINE int
dec_loop_avx2_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// Lookup tables; the same 16 entries are repeated in both 128-bit
	// lanes because the byte shuffle operates per lane.
	const __m256i tbl_lo = _mm256_setr_epi8(
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A,
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);

	const __m256i tbl_hi = _mm256_setr_epi8(
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);

	const __m256i tbl_roll = _mm256_setr_epi8(
		  0,  16,  19,   4, -65, -65, -71, -71,
		  0,   0,   0,   0,   0,   0,   0,   0,
		  0,  16,  19,   4, -65, -65, -71, -71,
		  0,   0,   0,   0,   0,   0,   0,   0);

	const __m256i all_2F = _mm256_set1_epi8(0x2F);

	// Fetch the 32 input characters (unaligned load).
	__m256i block = _mm256_loadu_si256((__m256i *) *s);

	// The SSSE3 decoder documents the algorithm; this is the same
	// sequence on 256-bit vectors.
	const __m256i shifted = _mm256_srli_epi32(block, 4);
	const __m256i nib_hi  = _mm256_and_si256(shifted, all_2F);
	const __m256i nib_lo  = _mm256_and_si256(block, all_2F);
	const __m256i cls_hi  = _mm256_shuffle_epi8(tbl_hi, nib_hi);
	const __m256i cls_lo  = _mm256_shuffle_epi8(tbl_lo, nib_lo);

	// Reject the block as soon as the two class masks share a set bit.
	if (_mm256_testz_si256(cls_lo, cls_hi) == 0) {
		return 0;
	}

	const __m256i is_2F   = _mm256_cmpeq_epi8(block, all_2F);
	const __m256i roll_ix = _mm256_add_epi8(is_2F, nib_hi);
	const __m256i delta   = _mm256_shuffle_epi8(tbl_roll, roll_ix);

	// Apply the per-character delta, then pack the result into the
	// 12-bytes-per-lane output layout.
	block = dec_reshuffle(_mm256_add_epi8(block, delta));

	// Write the result and move on to the next block.
	_mm256_storeu_si256((__m256i *) *o, block);

	*o += 24;
	*s += 32;
	*rounds -= 1;

	return 1;
}
// Bulk AVX2 base64 decoder: decodes as many 32-byte blocks as possible from
// *s to *o and keeps *slen and *olen in step. Decoding stops at the first
// block that dec_loop_avx2_inner rejects; the counters are then corrected for
// the rounds that were not carried out. Inputs shorter than 45 bytes are left
// untouched.
static inline void
dec_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 45) {
		return;
	}

	// Each round consumes 32 bytes. Every store writes 8 extra zero bytes
	// beyond the 24 output bytes, so at least 13 input bytes (11 data
	// bytes and up to two end-of-string markers) are held back to cover
	// the gap.
	size_t rounds = (*slen - 13) / 32;

	// Account for all rounds up front; undone below for skipped rounds.
	*slen -= rounds * 32;	// 32 bytes consumed per round
	*olen += rounds * 24;	// 24 bytes produced per round

	// Run the largest unrolled batch that still fits. `proceed` turns
	// false when a block is rejected, or after the final single round.
	int proceed;

	do {
		if (rounds >= 8) {
			proceed = dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds);
		}
		else if (rounds >= 4) {
			proceed = dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds);
		}
		else if (rounds >= 2) {
			proceed = dec_loop_avx2_inner(s, o, &rounds)
			       && dec_loop_avx2_inner(s, o, &rounds);
		}
		else {
			// Last single round: always leave the loop afterwards.
			dec_loop_avx2_inner(s, o, &rounds);
			proceed = 0;
		}
	}
	while (proceed && rounds > 0);

	// Give back whatever was accounted for but not actually decoded.
	*olen -= rounds * 24;
	*slen += rounds * 32;
}

View File

@@ -0,0 +1,34 @@
// Pack 32 decoded 6-bit values (one per byte) into 24 contiguous output bytes
// in the low part of the returned vector. The top 8 bytes of the result are
// filler.
static BASE64_FORCE_INLINE __m256i
dec_reshuffle (const __m256i in)
{
	// Multipliers for the two merge steps and selectors for the two
	// packing steps.
	const __m256i merge_bytes = _mm256_set1_epi32(0x01400140);
	const __m256i merge_words = _mm256_set1_epi32(0x00011000);
	const __m256i lane_picker = _mm256_setr_epi8(
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1,
		2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, -1, -1, -1, -1);
	const __m256i word_picker = _mm256_setr_epi32(0, 1, 2, 4, 5, 6, -1, -1);

	// in, lower lane, bits, upper case are most significant bits, lower
	// case are least significant bits:
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

	// Step 1: merge neighbouring bytes into 12-bit groups.
	const __m256i pairs = _mm256_maddubs_epi16(in, merge_bytes);
	// 0000kkkk LLllllll 0000JJJJ JJjjKKKK
	// 0000hhhh IIiiiiii 0000GGGG GGggHHHH
	// 0000eeee FFffffff 0000DDDD DDddEEEE
	// 0000bbbb CCcccccc 0000AAAA AAaaBBBB

	// Step 2: merge neighbouring 16-bit words into 24-bit groups.
	const __m256i triples = _mm256_madd_epi16(pairs, merge_words);
	// 00000000 JJJJJJjj KKKKkkkk LLllllll
	// 00000000 GGGGGGgg HHHHhhhh IIiiiiii
	// 00000000 DDDDDDdd EEEEeeee FFffffff
	// 00000000 AAAAAAaa BBBBbbbb CCcccccc

	// Step 3: squeeze the three payload bytes of each group together
	// within each 128-bit lane.
	const __m256i per_lane = _mm256_shuffle_epi8(triples, lane_picker);
	// 00000000 00000000 00000000 00000000
	// LLllllll KKKKkkkk JJJJJJjj IIiiiiii
	// HHHHhhhh GGGGGGgg FFffffff EEEEeeee
	// DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa

	// Step 4: close the gap between the two lanes.
	return _mm256_permutevar8x32_epi32(per_lane, word_picker);
}

View File

@@ -0,0 +1,89 @@
static BASE64_FORCE_INLINE void
enc_loop_avx2_inner_first (const uint8_t **s, uint8_t **o)
{
// First load is done at s - 0 to not get a segfault:
__m256i src = _mm256_loadu_si256((__m256i *) *s);
// Shift by 4 bytes, as required by enc_reshuffle:
src = _mm256_permutevar8x32_epi32(src, _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6));
// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);
// Subsequent loads will be done at s - 4, set pointer for next round:
*s += 20;
*o += 32;
}
static BASE64_FORCE_INLINE void
enc_loop_avx2_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
__m256i src = _mm256_loadu_si256((__m256i *) *s);
// Reshuffle, translate, store:
src = enc_reshuffle(src);
src = enc_translate(src);
_mm256_storeu_si256((__m256i *) *o, src);
*s += 24;
*o += 32;
}
// Bulk AVX2 base64 encoder built on intrinsics: encodes as many 24-byte groups
// as possible from *s to *o and adjusts *slen and *olen accordingly. Inputs
// shorter than 32 bytes are left untouched.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 32) {
		return;
	}

	// Each round consumes 24 bytes but loads 32 bytes at an offset of -4.
	// Keep 4 bytes in reserve so that the last load stays within the
	// bounds of the input buffer.
	size_t rounds = (*slen - 4) / 24;

	*olen += rounds * 32;	// 32 bytes produced per round
	*slen -= rounds * 24;	// 24 bytes consumed per round

	// The first round cannot load at the -4 offset without reading in
	// front of the buffer, so it has a dedicated implementation.
	enc_loop_avx2_inner_first(s, o);
	--rounds;

	// Work off the remaining rounds in the largest unrolled batch that
	// still fits.
	while (rounds > 0) {
		if (rounds >= 8) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 8;
		}
		else if (rounds >= 4) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 4;
		}
		else if (rounds >= 2) {
			enc_loop_avx2_inner(s, o);
			enc_loop_avx2_inner(s, o);
			rounds -= 2;
		}
		else {
			// Exactly one round left.
			enc_loop_avx2_inner(s, o);
			break;
		}
	}

	// Undo the -4 load offset so that *s points at the first unread byte.
	*s += 4;
}

View File

@@ -0,0 +1,291 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// Generate a block of inline assembly that loads register R0 from memory with
// `vlddqu`. R0 must be a string literal naming an asm operand (e.g. "a"),
// because it is pasted between `%[` and `]`. ROUND and OFFSET are stringified
// into the displacement, so round N reads from src + N * 24 + OFFSET:
// consecutive rounds are 24 input bytes apart, and OFFSET is a constant byte
// offset applied on top of that.
#define LOAD(R0, ROUND, OFFSET) \
"vlddqu ("#ROUND" * 24 + "#OFFSET")(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles the
// input register R0 using the preloaded constants lut0 and msk0..msk3.
// In the AT&T operand order used here the destination is the last operand, so
// R0 is only read, the result ends up in R1, and R2 is overwritten as a
// temporary. All three arguments must be string literals naming asm operands.
#define SHUF(R0, R1, R2) \
"vpshufb %[lut0], %["R0"], %["R1"] \n\t" \
"vpand %["R1"], %[msk0], %["R2"] \n\t" \
"vpand %["R1"], %[msk2], %["R1"] \n\t" \
"vpmulhuw %["R2"], %[msk1], %["R2"] \n\t" \
"vpmullw %["R1"], %[msk3], %["R1"] \n\t" \
"vpor %["R1"], %["R2"], %["R1"] \n\t"
// Generate a block of inline assembly that translates the contents of R1 to
// the base64 alphabet, using the preloaded constants n51, n25 and lut1.
// In the AT&T operand order used here the destination is the last operand, so
// R1 is only read, the translated result ends up in R0, and R2 is overwritten
// as a temporary. All three arguments must be string literals naming asm
// operands.
#define TRAN(R0, R1, R2) \
"vpsubusb %[n51], %["R1"], %["R0"] \n\t" \
"vpcmpgtb %[n25], %["R1"], %["R2"] \n\t" \
"vpsubb %["R2"], %["R0"], %["R0"] \n\t" \
"vpshufb %["R0"], %[lut1], %["R2"] \n\t" \
"vpaddb %["R1"], %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 with
// `vmovdqu`. R0 must be a string literal naming an asm operand. ROUND is
// stringified into the displacement, so round N writes to dst + N * 32:
// consecutive rounds are 32 output bytes apart.
#define STOR(R0, ROUND) \
"vmovdqu %["R0"], ("#ROUND" * 32)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
// The data is loaded into "a" at a byte offset of -4, shuffled into "b" (with
// "c" as scratch), translated back into "a" (again with "c" as scratch) and
// stored from "a". The pointers then advance by 24 input bytes and 32 output
// bytes, which is why the load and store can both use round index 0.
#define ROUND() \
LOAD("a", 0, -4) \
SHUF("a", "b", "c") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $24, %[src] \n\t" \
"add $32, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// On exit, a holds the fully shuffled and translated data of round 0 (ready
// to be stored), while b and c hold the raw, still unprocessed input of rounds
// 1 and 2. Registers d and e are used as intermediates along the way.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Each row has one
// symbol per register column. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
// . indicates that a register is not involved in that step.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0, -4) /* + . . . . . */ \
SHUF("a", "d", "e") /* | . . + x . */ \
LOAD("b", 1, -4) /* | + . | . . */ \
TRAN("a", "d", "e") /* | | . - x . */ \
LOAD("c", 2, -4) /* V V V . . . */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
// On entry, A is already translated while B and C hold raw input. On exit, D
// is already translated while E and F hold raw input, which is the same state
// shifted by three rounds. The macro stores rounds ROUND+0..ROUND+2 and loads
// rounds ROUND+3..ROUND+5.
// In the register graph each row has one symbol per register column:
// + loaded, | in use, - decommissioned, x temporary, V macro input/output,
// . not involved in that step.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3), -4) /* V V V + . . */ \
SHUF(B, E, F) /* | | | | + x */ \
STOR(A, (ROUND + 0)) /* - | | | | . */ \
TRAN(B, E, F) /* . | | | - x */ \
LOAD(E, (ROUND + 4), -4) /* . | | | + . */ \
SHUF(C, A, F) /* + | | | | x */ \
STOR(B, (ROUND + 1)) /* | - | | | . */ \
TRAN(C, A, F) /* - . | | | x */ \
LOAD(F, (ROUND + 5), -4) /* . . | | | + */ \
SHUF(D, A, B) /* + x | | | | */ \
STOR(C, (ROUND + 2)) /* | . - | | | */ \
TRAN(D, A, B) /* - x . V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
// On entry, D is already translated while E and F hold raw input. The macro
// stores rounds ROUND+3..ROUND+5 and issues no further loads; A, B and C are
// only used as intermediates.
// In the register graph each row has one symbol per register column:
// + loaded, | in use, - decommissioned, x temporary, V macro input/output,
// . not involved in that step.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A, B) /* + x . V V V */ \
STOR(D, (ROUND + 3)) /* | . . - | | */ \
TRAN(E, A, B) /* - x . . | | */ \
SHUF(F, C, D) /* . . + x | | */ \
STOR(E, (ROUND + 4)) /* . . | . - | */ \
TRAN(F, C, D) /* . . - x . | */ \
STOR(F, (ROUND + 5)) /* . . . . . - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A,
// so A and B rounds can be alternated to form a chain.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round. Because the regular round stores
// ROUND+0..ROUND+2 and the end macro stores ROUND+3..ROUND+5, a *_LAST macro
// completes six rounds in total.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round. Like the type A variant, it completes
// rounds ROUND+0..ROUND+5.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Bulk AVX2 base64 encoder, written as one inline assembly statement.
//
// s, o:  in/out pointers to the read and write cursors. They are bound to the
//        asm as read-write operands and are advanced by the asm itself.
// slen:  in/out count of input bytes; reduced by 24 per round.
// olen:  in/out count of output bytes; increased by 32 per round.
//
// Nothing is done when fewer than 32 input bytes are available. Otherwise the
// first round is executed unconditionally as a special case, and the rounds
// after it are split into `loops` passes through a 36x unrolled block and a
// remainder below 36, which is worked off by at most one 18x, one 9x and one
// 6x block followed by zero to five single rounds. Because the remainder is
// always below 36, the signed jumps (jl, jg) used for dispatching are safe
// even though `rounds` is a size_t.
static inline void
enc_loop_avx2 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 32) {
return;
}
// Process blocks of 24 bytes at a time. Because blocks are loaded 32
// bytes at a time an offset of -4, ensure that there will be at least
// 4 remaining bytes after the last round, so that the final read will
// not pass beyond the bounds of the input buffer.
size_t rounds = (*slen - 4) / 24;
*slen -= rounds * 24; // 24 bytes consumed per round
*olen += rounds * 32; // 32 bytes produced per round
// Pre-decrement the number of rounds to get the number of rounds
// *after* the first round, which is handled as a special case.
rounds--;
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop.
rounds %= 36;
// Lookup tables.
const __m256i lut0 = _mm256_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1,
14, 15, 13, 14, 11, 12, 10, 11, 8, 9, 7, 8, 5, 6, 4, 5);
const __m256i lut1 = _mm256_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers. They are deliberately left uninitialized: the
// asm declares them as write-only outputs and loads them itself.
__m256i a, b, c, d, e;
// Temporary register f doubles as the shift mask for the first round.
__m256i f = _mm256_setr_epi32(0, 0, 1, 2, 3, 4, 5, 6);
__asm__ volatile (
// The first loop iteration requires special handling to ensure
// that the read, which is normally done at an offset of -4,
// does not underflow the buffer. Load the buffer at an offset
// of 0 and permute the input to achieve the same effect.
LOAD("a", 0, 0)
"vpermd %[a], %[f], %[a] \n\t"
// Perform the standard shuffling and translation steps.
SHUF("a", "b", "c")
TRAN("a", "b", "c")
// Store the result and increment the source and dest pointers.
"vmovdqu %[a], (%[dst]) \n\t"
"add $24, %[src] \n\t"
"add $32, %[dst] \n\t"
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
" add $(24 * 36), %[src] \n\t"
" add $(32 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(24 * 18), %[src] \n\t"
" add $(32 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(24 * 9), %[src] \n\t"
" add $(32 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(24 * 6), %[src] \n\t"
" add $(32 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified). The write-only temporaries use the
// early-clobber modifier (&) so that they are never assigned the same
// register as one of the constant inputs below. f is read-write
// because it enters the asm holding the permutation indices.
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "+x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm256_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm256_set1_epi32(0x04000040)),
[msk2] "x" (_mm256_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm256_set1_epi32(0x01000010)),
[n51] "x" (_mm256_set1_epi8(51)),
[n25] "x" (_mm256_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@@ -0,0 +1,83 @@
// Split 24 input bytes into 32 six-bit values, one per output byte. This is
// the SSSE3 reshuffling algorithm carried over to AVX2. The input must be
// shifted by 4 bytes (payload in bytes 4..27) so that each 128-bit lane can be
// processed independently.
static BASE64_FORCE_INLINE __m256i
enc_reshuffle (const __m256i input)
{
	// Byte selector that duplicates and reorders the payload bytes so
	// that every 32-bit word holds one 3-byte group.
	const __m256i spread = _mm256_set_epi8(
		10, 11,  9, 10,
		 7,  8,  6,  7,
		 4,  5,  3,  4,
		 1,  2,  0,  1,
		14, 15, 13, 14,
		11, 12, 10, 11,
		 8,  9,  7,  8,
		 5,  6,  4,  5);

	// Bit masks and multipliers for the two halves of each word.
	const __m256i keep_ac  = _mm256_set1_epi32(0x0FC0FC00);
	const __m256i shift_ac = _mm256_set1_epi32(0x04000040);
	const __m256i keep_bd  = _mm256_set1_epi32(0x003F03F0);
	const __m256i shift_bd = _mm256_set1_epi32(0x01000010);

	// Input, bytes MSB to LSB:
	// 0 0 0 0 x w v u t s r q p o n m
	// l k j i h g f e d c b a 0 0 0 0

	const __m256i in = _mm256_shuffle_epi8(input, spread);
	// in, bytes MSB to LSB:
	// w x v w
	// t u s t
	// q r p q
	// n o m n
	// k l j k
	// h i g h
	// e f d e
	// b c a b

	const __m256i masked_ac = _mm256_and_si256(in, keep_ac);
	// bits, upper case are most significant bits, lower case are least
	// significant bits.
	// 0000wwww XX000000 VVVVVV00 00000000
	// 0000tttt UU000000 SSSSSS00 00000000
	// 0000qqqq RR000000 PPPPPP00 00000000
	// 0000nnnn OO000000 MMMMMM00 00000000
	// 0000kkkk LL000000 JJJJJJ00 00000000
	// 0000hhhh II000000 GGGGGG00 00000000
	// 0000eeee FF000000 DDDDDD00 00000000
	// 0000bbbb CC000000 AAAAAA00 00000000

	// The high half of a 16-bit multiply acts as a per-word right shift.
	const __m256i placed_ac = _mm256_mulhi_epu16(masked_ac, shift_ac);
	// 00000000 00wwwwXX 00000000 00VVVVVV
	// 00000000 00ttttUU 00000000 00SSSSSS
	// 00000000 00qqqqRR 00000000 00PPPPPP
	// 00000000 00nnnnOO 00000000 00MMMMMM
	// 00000000 00kkkkLL 00000000 00JJJJJJ
	// 00000000 00hhhhII 00000000 00GGGGGG
	// 00000000 00eeeeFF 00000000 00DDDDDD
	// 00000000 00bbbbCC 00000000 00AAAAAA

	const __m256i masked_bd = _mm256_and_si256(in, keep_bd);
	// 00000000 00xxxxxx 000000vv WWWW0000
	// 00000000 00uuuuuu 000000ss TTTT0000
	// 00000000 00rrrrrr 000000pp QQQQ0000
	// 00000000 00oooooo 000000mm NNNN0000
	// 00000000 00llllll 000000jj KKKK0000
	// 00000000 00iiiiii 000000gg HHHH0000
	// 00000000 00ffffff 000000dd EEEE0000
	// 00000000 00cccccc 000000aa BBBB0000

	// The low half of a 16-bit multiply acts as a per-word left shift.
	const __m256i placed_bd = _mm256_mullo_epi16(masked_bd, shift_bd);
	// 00xxxxxx 00000000 00vvWWWW 00000000
	// 00uuuuuu 00000000 00ssTTTT 00000000
	// 00rrrrrr 00000000 00ppQQQQ 00000000
	// 00oooooo 00000000 00mmNNNN 00000000
	// 00llllll 00000000 00jjKKKK 00000000
	// 00iiiiii 00000000 00ggHHHH 00000000
	// 00ffffff 00000000 00ddEEEE 00000000
	// 00cccccc 00000000 00aaBBBB 00000000

	// Combine both halves:
	// 00xxxxxx 00wwwwXX 00vvWWWW 00VVVVVV
	// 00uuuuuu 00ttttUU 00ssTTTT 00SSSSSS
	// 00rrrrrr 00qqqqRR 00ppQQQQ 00PPPPPP
	// 00oooooo 00nnnnOO 00mmNNNN 00MMMMMM
	// 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
	// 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
	// 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
	// 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
	return _mm256_or_si256(placed_ac, placed_bd);
}

View File

@@ -0,0 +1,30 @@
// Map 32 six-bit values (0..63, one per byte) to their base64 characters by
// adding a range-dependent offset to each value.
static BASE64_FORCE_INLINE __m256i
enc_translate (const __m256i in)
{
	// The values 0..63 fall into five ranges, each with its own offset:
	// # From    To        Abs Index   Characters
	// 0 [0..25] [65..90]  +65 0       ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1 [26..51] [97..122] +71 1      abcdefghijklmnopqrstuvwxyz
	// 2 [52..61] [48..57]  -4 [2..11] 0123456789
	// 3 [62]     [43]     -19 12      +
	// 4 [63]     [47]     -16 13      /
	//
	// Table of those offsets, indexed as shown above and repeated for
	// both 128-bit lanes:
	const __m256i offsets = _mm256_setr_epi8(
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0,
		65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);

	// 0xFF (-1) for every value in ranges #1..#4, 0x00 for range #0.
	const __m256i above_25 = _mm256_cmpgt_epi8(in, _mm256_set1_epi8(25));

	// Saturating subtraction of 51 yields the right index for range #0
	// and an index that is one too small for all other ranges.
	const __m256i rough = _mm256_subs_epu8(in, _mm256_set1_epi8(51));

	// Subtracting the -1 mask adds the missing 1 for ranges #1..#4.
	const __m256i index = _mm256_sub_epi8(rough, above_25);

	// Look up the offsets and add them to the input values.
	return _mm256_add_epi8(in, _mm256_shuffle_epi8(offsets, index));
}

View File

@@ -0,0 +1,44 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_AVX512
#include <immintrin.h>
#include "../avx2/dec_reshuffle.c"
#include "../avx2/dec_loop.c"
#include "enc_reshuffle_translate.c"
#include "enc_loop.c"
#endif // HAVE_AVX512
// AVX512 entry point for streaming base64 encoding. BASE64_ENC_PARAMS is a
// macro that expands to the parameter list; the stub call below shows that it
// provides state, src, srclen, out and outlen.
// When HAVE_AVX512 is false at compile time, the call is forwarded unchanged
// to base64_enc_stub. Otherwise the AVX512 bulk encoder runs between two
// textually included fragments.
// NOTE(review): enc_head.c and enc_tail.c are not visible from here;
// presumably they declare s, slen, o and olen and deal with the input that the
// bulk loop leaves unprocessed -- confirm against ../generic.
void
base64_stream_encode_avx512 BASE64_ENC_PARAMS
{
#if HAVE_AVX512
#include "../generic/enc_head.c"
enc_loop_avx512(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
// AVX512 support was not compiled in: delegate to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// AVX512 entry point for streaming base64 decoding. There is no dedicated
// AVX512 decoder at present: when HAVE_AVX512 is set, the AVX2 bulk decoder
// (dec_loop_avx2) is reused between two textually included fragments. When
// HAVE_AVX512 is false at compile time, the result of base64_dec_stub is
// returned instead.
// NOTE(review): no return statement is visible on the HAVE_AVX512 path, so the
// return value presumably comes from dec_tail.c; likewise s, slen, o and olen
// are presumably declared by dec_head.c -- confirm against ../generic.
int
base64_stream_decode_avx512 BASE64_DEC_PARAMS
{
#if HAVE_AVX512
#include "../generic/dec_head.c"
dec_loop_avx2(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// AVX512 support was not compiled in: delegate to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,61 @@
static BASE64_FORCE_INLINE void
enc_loop_avx512_inner (const uint8_t **s, uint8_t **o)
{
// Load input.
__m512i src = _mm512_loadu_si512((__m512i *) *s);
// Reshuffle, translate, store.
src = enc_reshuffle_translate(src);
_mm512_storeu_si512((__m512i *) *o, src);
*s += 48;
*o += 64;
}
// Bulk AVX512 encoder. Encodes as many 48-byte rounds as is safe, updating
// the input/output cursors and the remaining-input / produced-output counts.
// Does nothing when fewer than 64 input bytes are available.
static inline void
enc_loop_avx512 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	const size_t avail = *slen;

	if (avail < 64) {
		return;
	}

	// Every round consumes 48 bytes but loads 64. Holding back 24 bytes
	// when computing the round count keeps the last load inside the input
	// buffer.
	size_t left = (avail - 24) / 48;

	*slen = avail - left * 48;	// 48 bytes consumed per round
	*olen += left * 64;		// 64 bytes produced per round

	// Hand-unrolled: batches of eight rounds first...
	for (; left >= 8; left -= 8) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
	}

	// ...then at most one batch of four, one of two, and one single round.
	if (left >= 4) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		left -= 4;
	}
	if (left >= 2) {
		enc_loop_avx512_inner(s, o);
		enc_loop_avx512_inner(s, o);
		left -= 2;
	}
	if (left > 0) {
		enc_loop_avx512_inner(s, o);
	}
}

View File

@@ -0,0 +1,50 @@
// AVX512 algorithm is based on permutevar and multishift. The code is based on
// https://github.com/WojciechMula/base64simd which is under BSD-2 license.
//
// Takes one 64-byte register of raw input and returns 64 bytes of Base64
// text. Only input bytes 0..47 are referenced: the byte indices in
// shuffle_input below run from 0x00 to 0x2f.
static BASE64_FORCE_INLINE __m512i
enc_reshuffle_translate (const __m512i input)
{
// 32-bit input
// [ 0 0 0 0 0 0 0 0|c1 c0 d5 d4 d3 d2 d1 d0|
// b3 b2 b1 b0 c5 c4 c3 c2|a5 a4 a3 a2 a1 a0 b5 b4]
// output order [1, 2, 0, 1]
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
//
// Each 32-bit element of the table holds four source byte indices. From the
// most significant byte down they are (3k+1, 3k+2, 3k, 3k+1) for k = 0..15,
// so every group of three input bytes is spread over one 32-bit lane with
// its middle byte duplicated.
const __m512i shuffle_input = _mm512_setr_epi32(0x01020001,
0x04050304,
0x07080607,
0x0a0b090a,
0x0d0e0c0d,
0x10110f10,
0x13141213,
0x16171516,
0x191a1819,
0x1c1d1b1c,
0x1f201e1f,
0x22232122,
0x25262425,
0x28292728,
0x2b2c2a2b,
0x2e2f2d2e);
// Reorder bytes
// [b3 b2 b1 b0 c5 c4 c3 c2|c1 c0 d5 d4 d3 d2 d1 d0|
// a5 a4 a3 a2 a1 a0 b5 b4|b3 b2 b1 b0 c5 c4 c3 c2]
const __m512i in = _mm512_permutexvar_epi8(shuffle_input, input);
// After multishift a single 32-bit lane has following layout
// [c1 c0 d5 d4 d3 d2 d1 d0|b1 b0 c5 c4 c3 c2 c1 c0|
// a1 a0 b5 b4 b3 b2 b1 b0|d1 d0 a5 a4 a3 a2 a1 a0]
// (a = [10:17], b = [4:11], c = [22:27], d = [16:21])
// 48, 54, 36, 42, 16, 22, 4, 10
// The list above is the shift constant read from its most significant byte
// (0x30) down to its least significant byte (0x0a); each byte is the bit
// offset from which the corresponding output byte is taken.
const __m512i shifts = _mm512_set1_epi64(0x3036242a1016040alu);
__m512i shuffled_in = _mm512_multishift_epi64_epi8(shifts, in);
// Translate immediately after reshuffled.
// The 64-byte alphabet table is loaded whole into one register.
const __m512i lookup = _mm512_loadu_si512(base64_table_enc_6bit);
// Translation 6-bit values to ASCII.
// The shuffled bytes act as indices into the table. Per the layout comment
// above, their top two bits are leftovers from neighbouring fields; the byte
// permute is documented to use only the low six index bits.
return _mm512_permutexvar_epi8(shuffled_in, lookup);
}

View File

@@ -0,0 +1,86 @@
// One generic decoding round: look up four input characters, and if all are
// valid, store the decoded word, advance the input by 4 and the output by 3,
// and decrement *rounds. Returns 1 on success. Returns 0 without touching
// the cursors or *rounds when an invalid character is found.
static BASE64_FORCE_INLINE int
dec_loop_generic_32_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
	// The lookup tables flag an invalid character in a different bit
	// depending on byte order: the MSB for little-endian tables, the LSB
	// for big-endian ones.
#if BASE64_LITTLE_ENDIAN
	const uint32_t invalid_bit = UINT32_C(0x80000000);
#else
	const uint32_t invalid_bit = UINT32_C(1);
#endif
	const uint8_t *in = *s;

	// OR together the four per-character table entries:
	uint32_t word = base64_table_dec_32bit_d0[in[0]];
	word |= base64_table_dec_32bit_d1[in[1]];
	word |= base64_table_dec_32bit_d2[in[2]];
	word |= base64_table_dec_32bit_d3[in[3]];

	if ((word & invalid_bit) != 0) {
		return 0;
	}

	// The whole 32-bit word is written, but the output cursor only
	// advances by three bytes:
	memcpy(*o, &word, sizeof (word));

	*rounds -= 1;
	*o += 3;
	*s += 4;

	return 1;
}
// Bulk generic decoder. Decodes 4-character rounds until the round budget is
// used up or an invalid character is met, then corrects the length counters
// for any rounds that were not carried out. Does nothing for inputs shorter
// than 8 bytes.
static inline void
dec_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	if (*slen < 8) {
		return;
	}

	// Each round consumes 4 bytes and writes one byte more than it
	// produces. Leaving at least 4 input bytes unprocessed (two data bytes
	// and up to two end-of-string markers) covers that extra byte.
	size_t rounds = (*slen - 4) / 4;

	// Account for all rounds up front; undone rounds are given back below.
	*slen -= rounds * 4;	// 4 bytes consumed per round
	*olen += rounds * 3;	// 3 bytes produced per round

	while (rounds > 0) {
		int ok;

		// Hand-unrolled batches of 8, 4, 2 or 1 rounds. The inner
		// function decrements `rounds' itself on success, and the &&
		// chain stops at the first failing round.
		if (rounds >= 8) {
			ok = dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds);
		}
		else if (rounds >= 4) {
			ok = dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds);
		}
		else if (rounds >= 2) {
			ok = dec_loop_generic_32_inner(s, o, &rounds)
			  && dec_loop_generic_32_inner(s, o, &rounds);
		}
		else {
			ok = dec_loop_generic_32_inner(s, o, &rounds);
		}

		if (!ok) {
			break;
		}
	}

	// Give back whatever rounds were skipped:
	*slen += rounds * 4;
	*olen -= rounds * 3;
}

View File

@@ -0,0 +1,73 @@
// One generic 32-bit encoding round: read a 32-bit word at *s, emit four
// output bytes at *o via two 12-bit table lookups, then advance the input by
// 3 bytes and the output by 4 bytes.
static BASE64_FORCE_INLINE void
enc_loop_generic_32_inner (const uint8_t **s, uint8_t **o)
{
	uint8_t *dst = *o;
	uint32_t word;

	// Load four bytes through memcpy:
	memcpy(&word, *s, sizeof (word));

	// The shifts below need the first input byte in the most significant
	// position, so convert to big-endian if not already in that order:
	word = BASE64_HTOBE32(word);

	// Bits 31..20 and 19..8 are the two indices into the 12-bit table:
	const size_t first  = (word >> 20) & 0xFFFU;
	const size_t second = (word >>  8) & 0xFFFU;

	// Each lookup copies two bytes from the table entry to the output:
	memcpy(dst + 0, &base64_table_enc_12bit[first], 2);
	memcpy(dst + 2, &base64_table_enc_12bit[second], 2);

	*o = dst + 4;
	*s += 3;
}
// Bulk generic 32-bit encoder. Encodes as many 3-byte rounds as is safe,
// updating the cursors and the remaining-input / produced-output counts.
// Does nothing when fewer than 4 input bytes are available.
static inline void
enc_loop_generic_32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	const size_t avail = *slen;

	if (avail < 4) {
		return;
	}

	// A round consumes 3 bytes but loads 4. Keeping one byte in reserve
	// when computing the round count stops the last load from reading past
	// the end of the input buffer.
	size_t left = (avail - 1) / 3;

	*slen = avail - left * 3;	// 3 bytes consumed per round
	*olen += left * 4;		// 4 bytes produced per round

	// Hand-unrolled: batches of eight rounds first...
	for (; left >= 8; left -= 8) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
	}

	// ...then at most one batch of four, one of two, and one single round.
	if (left >= 4) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		left -= 4;
	}
	if (left >= 2) {
		enc_loop_generic_32_inner(s, o);
		enc_loop_generic_32_inner(s, o);
		left -= 2;
	}
	if (left > 0) {
		enc_loop_generic_32_inner(s, o);
	}
}

View File

@@ -0,0 +1,77 @@
// One generic 64-bit encoding round: read a 64-bit word at *s, emit eight
// output bytes at *o via four 12-bit table lookups, then advance the input by
// 6 bytes and the output by 8 bytes.
static BASE64_FORCE_INLINE void
enc_loop_generic_64_inner (const uint8_t **s, uint8_t **o)
{
	uint8_t *dst = *o;
	uint64_t word;

	// Load eight bytes through memcpy:
	memcpy(&word, *s, sizeof (word));

	// The shifts below need the first input byte in the most significant
	// position, so convert to big-endian if not already in that order:
	word = BASE64_HTOBE64(word);

	// Four consecutive 12-bit fields, starting from the top of the word:
	const size_t first  = (word >> 52) & 0xFFFU;
	const size_t second = (word >> 40) & 0xFFFU;
	const size_t third  = (word >> 28) & 0xFFFU;
	const size_t fourth = (word >> 16) & 0xFFFU;

	// Each lookup copies two bytes from the table entry to the output:
	memcpy(dst + 0, &base64_table_enc_12bit[first], 2);
	memcpy(dst + 2, &base64_table_enc_12bit[second], 2);
	memcpy(dst + 4, &base64_table_enc_12bit[third], 2);
	memcpy(dst + 6, &base64_table_enc_12bit[fourth], 2);

	*o = dst + 8;
	*s += 6;
}
// Bulk generic 64-bit encoder. Encodes as many 6-byte rounds as is safe,
// updating the cursors and the remaining-input / produced-output counts.
// Does nothing when fewer than 8 input bytes are available.
static inline void
enc_loop_generic_64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	const size_t avail = *slen;

	if (avail < 8) {
		return;
	}

	// A round consumes 6 bytes but loads 8. Keeping two bytes in reserve
	// when computing the round count stops the last load from reading past
	// the end of the input buffer.
	size_t left = (avail - 2) / 6;

	*slen = avail - left * 6;	// 6 bytes consumed per round
	*olen += left * 8;		// 8 bytes produced per round

	// Hand-unrolled: batches of eight rounds first...
	for (; left >= 8; left -= 8) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
	}

	// ...then at most one batch of four, one of two, and one single round.
	if (left >= 4) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		left -= 4;
	}
	if (left >= 2) {
		enc_loop_generic_64_inner(s, o);
		enc_loop_generic_64_inner(s, o);
		left -= 2;
	}
	if (left > 0) {
		enc_loop_generic_64_inner(s, o);
	}
}

View File

@@ -0,0 +1,41 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if BASE64_WORDSIZE == 32
# include "32/enc_loop.c"
#elif BASE64_WORDSIZE == 64
# include "64/enc_loop.c"
#endif
#if BASE64_WORDSIZE >= 32
# include "32/dec_loop.c"
#endif
// Streaming encoder entry point for the plain (non-SIMD) codec.
//
// The parameter list is supplied by the BASE64_ENC_PARAMS macro, defined
// outside this file. The body is assembled from the two shared encoder
// fragments with a word-sized bulk loop placed between them. Which bulk loop
// is used depends on BASE64_WORDSIZE: 32 or 64 selects the matching generic
// loop, and any other value leaves only the code from the two fragments.
void
base64_stream_encode_plain BASE64_ENC_PARAMS
{
#include "enc_head.c"
#if BASE64_WORDSIZE == 32
enc_loop_generic_32(&s, &slen, &o, &olen);
#elif BASE64_WORDSIZE == 64
enc_loop_generic_64(&s, &slen, &o, &olen);
#endif
#include "enc_tail.c"
}
// Streaming decoder entry point for the plain (non-SIMD) codec.
//
// The parameter list is supplied by the BASE64_DEC_PARAMS macro, defined
// outside this file. The body is assembled from the two shared decoder
// fragments. When BASE64_WORDSIZE is at least 32 the 32-bit generic bulk loop
// is placed between them (there is no separate 64-bit decode loop here).
// NOTE(review): this function has no return statement of its own, so the
// included dec_tail.c is presumably what returns the result -- confirm.
int
base64_stream_decode_plain BASE64_DEC_PARAMS
{
#include "dec_head.c"
#if BASE64_WORDSIZE >= 32
dec_loop_generic_32(&s, &slen, &o, &olen);
#endif
#include "dec_tail.c"
}

View File

@@ -0,0 +1,37 @@
// Opening fragment of a streaming decoder body. It is not a complete
// translation unit: it is meant to be #included inside a function that has
// state, src, srclen, out and outlen in scope. It declares the working
// locals, handles a stream that has already ended, and opens a switch/for
// construct that this fragment leaves unclosed.
int ret = 0;
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;
uint8_t q;
// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.eof = state->eof;
st.bytes = state->bytes;
st.carry = state->carry;
// If we previously saw an EOF or an invalid character, bail out:
if (st.eof) {
*outlen = 0;
ret = 0;
// If there was a trailing '=' to check, check it:
// (BASE64_AEOF is the "almost EOF" state. The call succeeds only if the
// single remaining input byte maps to 254 in the 8-bit decode table.)
if (slen && (st.eof == BASE64_AEOF)) {
state->bytes = 0;
state->eof = BASE64_EOF;
ret = ((base64_table_dec_8bit[*s++] == 254) && (slen == 1)) ? 1 : 0;
}
return ret;
}
// Turn four 6-bit numbers into three bytes:
// out[0] = 11111122
// out[1] = 22223333
// out[2] = 33444444
// Duff's device again:
// st.bytes is the position within the current four-character group, so the
// switch resumes decoding at the point where the previous call stopped.
switch (st.bytes)
{
for (;;)
{
case 0:

View File

@@ -0,0 +1,91 @@
// Closing fragment of a streaming decoder body. It continues inside an
// already-open `case 0:' of a switch/for construct and relies on the locals
// s, slen, o, olen, q, ret and st being in scope. It decodes one character
// per case label, writes the working state back to *state, stores the output
// length and returns ret.
//
// Table values of 254 and above are not data: the comments below identify
// 254 as '=' and 255 as an invalid character.
if (slen-- == 0) {
// Input exhausted on a group boundary: success.
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*s++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 0:
break;
}
st.carry = q << 2;
st.bytes++;
// Deliberate fallthrough:
BASE64_FALLTHROUGH
case 1: if (slen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*s++]) >= 254) {
st.eof = BASE64_EOF;
// Treat character '=' as invalid for byte 1:
break;
}
// First output byte: carried bits from character 0 plus the top bits of
// character 1.
*o++ = st.carry | (q >> 4);
st.carry = q << 4;
st.bytes++;
olen++;
// Deliberate fallthrough:
BASE64_FALLTHROUGH
case 2: if (slen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*s++]) >= 254) {
st.bytes++;
// When q == 254, the input char is '='.
// Check if next byte is also '=':
if (q == 254) {
if (slen-- != 0) {
st.bytes = 0;
// EOF:
st.eof = BASE64_EOF;
q = base64_table_dec_8bit[*s++];
// Valid only if the second '=' is the last input byte.
ret = ((q == 254) && (slen == 0)) ? 1 : 0;
break;
}
else {
// Almost EOF
// The second '=' has not arrived yet; the next call checks for it.
st.eof = BASE64_AEOF;
ret = 1;
break;
}
}
// If we get here, there was an error:
break;
}
*o++ = st.carry | (q >> 2);
st.carry = q << 6;
st.bytes++;
olen++;
// Deliberate fallthrough:
BASE64_FALLTHROUGH
case 3: if (slen-- == 0) {
ret = 1;
break;
}
if ((q = base64_table_dec_8bit[*s++]) >= 254) {
st.bytes = 0;
st.eof = BASE64_EOF;
// When q == 254, the input char is '='. Return 1 and EOF.
// When q == 255, the input char is invalid. Return 0 and EOF.
ret = ((q == 254) && (slen == 0)) ? 1 : 0;
break;
}
*o++ = st.carry | q;
st.carry = 0;
st.bytes = 0;
olen++;
}
}
// Write the working copy of the state back and report the output length:
state->eof = st.eof;
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;
return ret;

View File

@@ -0,0 +1,24 @@
// Opening fragment of a streaming encoder body. It is not a complete
// translation unit: it is meant to be #included inside a function that has
// state, src, srclen and out in scope. It declares the working locals and
// opens a switch/for construct that this fragment leaves unclosed.
//
// Assume that *out is large enough to contain the output.
// Theoretically it should be 4/3 the length of src.
const uint8_t *s = (const uint8_t *) src;
uint8_t *o = (uint8_t *) out;
// Use local temporaries to avoid cache thrashing:
size_t olen = 0;
size_t slen = srclen;
struct base64_state st;
st.bytes = state->bytes;
st.carry = state->carry;
// Turn three bytes into four 6-bit numbers:
// in[0] = 00111111
// in[1] = 00112222
// in[2] = 00222233
// in[3] = 00333333
// Duff's device, a for() loop inside a switch() statement. Legal!
// st.bytes is the position within the current three-byte group, so the
// switch resumes encoding at the point where the previous call stopped.
switch (st.bytes)
{
for (;;)
{
case 0:

View File

@@ -0,0 +1,34 @@
// Closing fragment of a streaming encoder body. It continues inside an
// already-open `case 0:' of a switch/for construct and relies on the locals
// s, slen, o, olen and st being in scope. It encodes one input byte per case
// label, writes the working state back to *state and stores the output
// length.
if (slen-- == 0) {
break;
}
// Byte 0: emit its top six bits, carry the low two bits forward.
*o++ = base64_table_enc_6bit[*s >> 2];
st.carry = (*s++ << 4) & 0x30;
st.bytes++;
olen += 1;
// Deliberate fallthrough:
BASE64_FALLTHROUGH
case 1: if (slen-- == 0) {
break;
}
// Byte 1: emit the carry plus its top four bits, carry the low four bits.
*o++ = base64_table_enc_6bit[st.carry | (*s >> 4)];
st.carry = (*s++ << 2) & 0x3C;
st.bytes++;
olen += 1;
// Deliberate fallthrough:
BASE64_FALLTHROUGH
case 2: if (slen-- == 0) {
break;
}
// Byte 2: emit the carry plus its top two bits, then its low six bits.
*o++ = base64_table_enc_6bit[st.carry | (*s >> 6)];
*o++ = base64_table_enc_6bit[*s++ & 0x3F];
st.bytes = 0;
olen += 2;
}
}
// Write the working copy of the state back and report the output length:
state->bytes = st.bytes;
state->carry = st.carry;
*outlen = olen;

View File

@@ -0,0 +1,79 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#ifdef __arm__
# if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
# define BASE64_USE_NEON32
# endif
#endif
#ifdef BASE64_USE_NEON32
#include <arm_neon.h>
// Only enable inline assembly on supported compilers.
#if defined(__GNUC__) || defined(__clang__)
#define BASE64_NEON32_USE_ASM
#endif
// Emulation of the NEON64 `vqtbl1q_u8' intrinsic for NEON32: a 16-byte table
// lookup with 16 indices. NEON32 only has lookups that produce 8 bytes at a
// time, so the table is split into two 8-byte halves and the low and high
// halves of the index vector are looked up separately.
static BASE64_FORCE_INLINE uint8x16_t
vqtbl1q_u8 (const uint8x16_t lut, const uint8x16_t indices)
{
	uint8x8x2_t table;

	table.val[0] = vget_low_u8(lut);
	table.val[1] = vget_high_u8(lut);

	const uint8x8_t lower = vtbl2_u8(table, vget_low_u8(indices));
	const uint8x8_t upper = vtbl2_u8(table, vget_high_u8(indices));

	return vcombine_u8(lower, upper);
}
#include "../generic/32/dec_loop.c"
#include "../generic/32/enc_loop.c"
#include "dec_loop.c"
#include "enc_reshuffle.c"
#include "enc_translate.c"
#include "enc_loop.c"
#endif // BASE64_USE_NEON32
// Stride size is so large on these NEON 32-bit functions
// (48 bytes encode, 64 bytes decode) that we inline the
// uint32 codec to stay performant on smaller inputs.
// Streaming encoder entry point for the NEON32 codec.
//
// The parameter list is supplied by the BASE64_ENC_PARAMS macro, defined
// outside this file. When BASE64_USE_NEON32 is defined the body is assembled
// from the two shared encoder fragments with two bulk loops between them: the
// NEON loop takes whole 48-byte rounds first, then the generic 32-bit loop
// takes 3-byte rounds from what is left. Otherwise the call is forwarded to
// base64_enc_stub().
void
base64_stream_encode_neon32 BASE64_ENC_PARAMS
{
#ifdef BASE64_USE_NEON32
#include "../generic/enc_head.c"
enc_loop_neon32(&s, &slen, &o, &olen);
enc_loop_generic_32(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming decoder entry point for the NEON32 codec.
//
// The parameter list is supplied by the BASE64_DEC_PARAMS macro, defined
// outside this file. When BASE64_USE_NEON32 is defined the body is assembled
// from the two shared decoder fragments with two bulk loops between them: the
// NEON loop takes whole 64-byte rounds first, then the generic 32-bit loop
// takes 4-byte rounds from what is left. Both loops stop early on invalid
// input and leave it to the code that follows. Otherwise the call is
// forwarded to base64_dec_stub() and its result is returned.
// NOTE(review): the NEON branch has no return statement of its own, so the
// included dec_tail.c is presumably what returns the result -- confirm.
int
base64_stream_decode_neon32 BASE64_DEC_PARAMS
{
#ifdef BASE64_USE_NEON32
#include "../generic/dec_head.c"
dec_loop_neon32(&s, &slen, &o, &olen);
dec_loop_generic_32(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,106 @@
// Returns 1 if any bit of the 128-bit vector is set, 0 otherwise.
static BASE64_FORCE_INLINE int
is_nonzero (const uint8x16_t v)
{
	uint64_t packed;

	// View the vector as two 64-bit lanes and narrow each to 32 bits with
	// saturation, so a nonzero lane cannot collapse to zero:
	const uint32x2_t narrowed = vqmovn_u64(vreinterpretq_u64_u8(v));

	// Move the resulting 64 bits into a scalar and test it:
	vst1_u64(&packed, vreinterpret_u64_u32(narrowed));

	return packed != 0;
}
// Maps each byte of `v' through an 8-entry table of offsets. The offsets are
// stored as unsigned bytes: 191 is -65 and 185 is -71 modulo 256.
static BASE64_FORCE_INLINE uint8x16_t
delta_lookup (const uint8x16_t v)
{
	const uint8x8_t deltas = {
		0U, 16U, 19U, 4U, 191U, 191U, 185U, 185U,
	};

	// The table lookup handles eight bytes at a time, so process the two
	// halves of the vector separately and join the results:
	const uint8x8_t lower = vtbl1_u8(deltas, vget_low_u8(v));
	const uint8x8_t upper = vtbl1_u8(deltas, vget_high_u8(v));

	return vcombine_u8(lower, upper);
}
// Decodes 16 Base64 characters in place: *lane is replaced by the input plus
// a per-character offset. Returns a mask computed from two nibble lookups;
// the caller treats any nonzero byte in it as invalid input.
static BASE64_FORCE_INLINE uint8x16_t
dec_loop_neon32_lane (uint8x16_t *lane)
{
	// See the SSSE3 decoder for an explanation of the algorithm.
	// Classification table indexed by the high nibble:
	const uint8x16_t class_hi = {
		0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
		0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
	};

	// Classification table indexed by the low nibble:
	const uint8x16_t class_lo = {
		0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
		0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A
	};

	const uint8x16_t in = *lane;

	// Split each character into its two nibbles:
	const uint8x16_t nib_hi = vshrq_n_u8(in, 4);
	const uint8x16_t nib_lo = vandq_u8(in, vdupq_n_u8(0x0F));

	// 0xFF where the character is 0x2F ('/'), 0x00 elsewhere. Adding this
	// to the high nibble lowers the offset index by one for that character:
	const uint8x16_t is_slash = vceqq_u8(in, vdupq_n_u8(0x2F));

	const uint8x16_t bits_hi = vqtbl1q_u8(class_hi, nib_hi);
	const uint8x16_t bits_lo = vqtbl1q_u8(class_lo, nib_lo);

	// Add the offset selected by the adjusted high nibble to the input:
	*lane = vaddq_u8(in, delta_lookup(vaddq_u8(is_slash, nib_hi)));

	// The two classifications share a bit only for rejected characters:
	return vandq_u8(bits_lo, bits_hi);
}
static inline void
dec_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
if (*slen < 64) {
return;
}
// Process blocks of 64 bytes per round. Unlike the SSE codecs, no
// extra trailing zero bytes are written, so it is not necessary to
// reserve extra input bytes:
size_t rounds = *slen / 64;
*slen -= rounds * 64; // 64 bytes consumed per round
*olen += rounds * 48; // 48 bytes produced per round
do {
uint8x16x3_t dec;
// Load 64 bytes and deinterleave:
uint8x16x4_t str = vld4q_u8(*s);
// Decode each lane, collect a mask of invalid inputs:
const uint8x16_t classified
= dec_loop_neon32_lane(&str.val[0])
| dec_loop_neon32_lane(&str.val[1])
| dec_loop_neon32_lane(&str.val[2])
| dec_loop_neon32_lane(&str.val[3]);
// Check for invalid input: if any of the delta values are
// zero, fall back on bytewise code to do error checking and
// reporting:
if (is_nonzero(classified)) {
break;
}
// Compress four bytes into three:
dec.val[0] = vorrq_u8(vshlq_n_u8(str.val[0], 2), vshrq_n_u8(str.val[1], 4));
dec.val[1] = vorrq_u8(vshlq_n_u8(str.val[1], 4), vshrq_n_u8(str.val[2], 2));
dec.val[2] = vorrq_u8(vshlq_n_u8(str.val[2], 6), str.val[3]);
// Interleave and store decoded result:
vst3q_u8(*o, dec);
*s += 64;
*o += 48;
} while (--rounds > 0);
// Adjust for any rounds that were skipped:
*slen += rounds * 64;
*olen -= rounds * 48;
}

View File

@@ -0,0 +1,170 @@
#ifdef BASE64_NEON32_USE_ASM
// One NEON32 encoding round written as a single inline-assembly block. It
// reads 48 bytes at *s and writes 64 bytes at *o. Both pointers are advanced
// by the post-incrementing loads and stores inside the assembly; they are
// bound as read-write operands.
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner_asm (const uint8_t **s, uint8_t **o)
{
// This function duplicates the functionality of enc_loop_neon32_inner,
// but entirely with inline assembly. This gives a significant speedup
// over using NEON intrinsics, which do not always generate very good
// code. The logic of the assembly is directly lifted from the
// intrinsics version, so it can be used as a guide to this code.
// Temporary registers, used as scratch space.
uint8x16_t tmp0, tmp1, tmp2, tmp3;
uint8x16_t mask0, mask1, mask2, mask3;
// A lookup table containing the absolute offsets for all ranges.
// The values are unsigned bytes: 252, 237 and 240 are -4, -19 and -16
// modulo 256.
const uint8x16_t lut = {
65U, 71U, 252U, 252U,
252U, 252U, 252U, 252U,
252U, 252U, 252U, 252U,
237U, 240U, 0U, 0U
};
// Numeric constants.
const uint8x16_t n51 = vdupq_n_u8(51);
const uint8x16_t n25 = vdupq_n_u8(25);
const uint8x16_t n63 = vdupq_n_u8(63);
__asm__ (
// Load 48 bytes and deinterleave. The bytes are loaded to
// hard-coded registers q12, q13 and q14, to ensure that they
// are contiguous. Increment the source pointer.
"vld3.8 {d24, d26, d28}, [%[src]]! \n\t"
"vld3.8 {d25, d27, d29}, [%[src]]! \n\t"
// Reshuffle the bytes using temporaries.
"vshr.u8 %q[t0], q12, #2 \n\t"
"vshr.u8 %q[t1], q13, #4 \n\t"
"vshr.u8 %q[t2], q14, #6 \n\t"
"vsli.8 %q[t1], q12, #4 \n\t"
"vsli.8 %q[t2], q13, #2 \n\t"
"vand.u8 %q[t1], %q[t1], %q[n63] \n\t"
"vand.u8 %q[t2], %q[t2], %q[n63] \n\t"
"vand.u8 %q[t3], q14, %q[n63] \n\t"
// t0..t3 are the reshuffled inputs. Create LUT indices.
// (Saturating subtraction of 51; q12..q15 are reused for the indices.)
"vqsub.u8 q12, %q[t0], %q[n51] \n\t"
"vqsub.u8 q13, %q[t1], %q[n51] \n\t"
"vqsub.u8 q14, %q[t2], %q[n51] \n\t"
"vqsub.u8 q15, %q[t3], %q[n51] \n\t"
// Create the mask for range #0.
// (The mask is all-ones where the value is greater than 25, i.e. for
// every value outside range #0.)
"vcgt.u8 %q[m0], %q[t0], %q[n25] \n\t"
"vcgt.u8 %q[m1], %q[t1], %q[n25] \n\t"
"vcgt.u8 %q[m2], %q[t2], %q[n25] \n\t"
"vcgt.u8 %q[m3], %q[t3], %q[n25] \n\t"
// Subtract -1 to correct the LUT indices.
"vsub.u8 q12, %q[m0] \n\t"
"vsub.u8 q13, %q[m1] \n\t"
"vsub.u8 q14, %q[m2] \n\t"
"vsub.u8 q15, %q[m3] \n\t"
// Lookup the delta values.
// (One lookup per 64-bit half of each of q12..q15.)
"vtbl.u8 d24, {%q[lut]}, d24 \n\t"
"vtbl.u8 d25, {%q[lut]}, d25 \n\t"
"vtbl.u8 d26, {%q[lut]}, d26 \n\t"
"vtbl.u8 d27, {%q[lut]}, d27 \n\t"
"vtbl.u8 d28, {%q[lut]}, d28 \n\t"
"vtbl.u8 d29, {%q[lut]}, d29 \n\t"
"vtbl.u8 d30, {%q[lut]}, d30 \n\t"
"vtbl.u8 d31, {%q[lut]}, d31 \n\t"
// Add the delta values.
"vadd.u8 q12, %q[t0] \n\t"
"vadd.u8 q13, %q[t1] \n\t"
"vadd.u8 q14, %q[t2] \n\t"
"vadd.u8 q15, %q[t3] \n\t"
// Store 64 bytes and interleave. Increment the dest pointer.
"vst4.8 {d24, d26, d28, d30}, [%[dst]]! \n\t"
"vst4.8 {d25, d27, d29, d31}, [%[dst]]! \n\t"
// Outputs (modified).
// The temporaries are early-clobber ("=&w") outputs.
: [src] "+r" (*s),
[dst] "+r" (*o),
[t0] "=&w" (tmp0),
[t1] "=&w" (tmp1),
[t2] "=&w" (tmp2),
[t3] "=&w" (tmp3),
[m0] "=&w" (mask0),
[m1] "=&w" (mask1),
[m2] "=&w" (mask2),
[m3] "=&w" (mask3)
// Inputs (not modified).
: [lut] "w" (lut),
[n25] "w" (n25),
[n51] "w" (n51),
[n63] "w" (n63)
// Clobbers.
// d24..d31 are the halves of the hard-coded q12..q15; "memory" covers the
// loads and stores made through the src and dst pointers.
: "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31",
"cc", "memory"
);
}
#endif
static BASE64_FORCE_INLINE void
enc_loop_neon32_inner (const uint8_t **s, uint8_t **o)
{
#ifdef BASE64_NEON32_USE_ASM
enc_loop_neon32_inner_asm(s, o);
#else
// Load 48 bytes and deinterleave:
uint8x16x3_t src = vld3q_u8(*s);
// Reshuffle:
uint8x16x4_t out = enc_reshuffle(src);
// Translate reshuffled bytes to the Base64 alphabet:
out = enc_translate(out);
// Interleave and store output:
vst4q_u8(*o, out);
*s += 48;
*o += 64;
#endif
}
// Bulk NEON32 encoder. Encodes every whole 48-byte block of the input,
// updating the cursors and the remaining-input / produced-output counts.
// With fewer than 48 bytes available it does no work.
static inline void
enc_loop_neon32 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
	size_t left = *slen / 48;

	*slen -= left * 48;	// 48 bytes consumed per round
	*olen += left * 64;	// 64 bytes produced per round

	// Hand-unrolled: batches of eight rounds first...
	for (; left >= 8; left -= 8) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
	}

	// ...then at most one batch of four, one of two, and one single round.
	if (left >= 4) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		left -= 4;
	}
	if (left >= 2) {
		enc_loop_neon32_inner(s, o);
		enc_loop_neon32_inner(s, o);
		left -= 2;
	}
	if (left > 0) {
		enc_loop_neon32_inner(s, o);
	}
}

View File

@@ -0,0 +1,31 @@
// Spreads the 24 bits of each (a, b, c) input byte triple over four output
// bytes holding six bits each, with the top two bits of every output byte
// zero.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (uint8x16x3_t in)
{
	// Input:
	// in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
	// in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
	// in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
	// Output:
	// out[0] = 00 00 a7 a6 a5 a4 a3 a2
	// out[1] = 00 00 a1 a0 b7 b6 b5 b4
	// out[2] = 00 00 b3 b2 b1 b0 c7 c6
	// out[3] = 00 00 c5 c4 c3 c2 c1 c0
	const uint8x16_t low6 = vdupq_n_u8(0x3F);
	uint8x16x4_t out;

	// A plain right shift already leaves the top two bits clear:
	out.val[0] = vshrq_n_u8(in.val[0], 2);

	// Shift the high bits of b down, insert a shifted left by 4 above
	// them, then drop the two excess high bits:
	out.val[1] = vandq_u8(
		vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), low6);

	// Likewise for the top two bits of c with b inserted above them:
	out.val[2] = vandq_u8(
		vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), low6);

	// The last output is just the low six bits of c:
	out.val[3] = vandq_u8(in.val[2], low6);

	return out;
}

View File

@@ -0,0 +1,57 @@
// Translates four lanes of 6-bit values (0..63) to Base64 alphabet
// characters by adding a range-dependent offset to every value.
static BASE64_FORCE_INLINE uint8x16x4_t
enc_translate (const uint8x16x4_t in)
{
	// Offsets for all ranges, stored as unsigned bytes (252, 237 and 240
	// are -4, -19 and -16 modulo 256):
	const uint8x16_t lut = {
		 65U,  71U, 252U, 252U,
		252U, 252U, 252U, 252U,
		252U, 252U, 252U, 252U,
		237U, 240U,   0U,   0U
	};

	// There are five sets of values:
	// #  From      To         Abs    Index  Characters
	// 0  [0..25]   [65..90]   +65        0  ABCDEFGHIJKLMNOPQRSTUVWXYZ
	// 1  [26..51]  [97..122]  +71        1  abcdefghijklmnopqrstuvwxyz
	// 2  [52..61]  [48..57]    -4  [2..11]  0123456789
	// 3  [62]      [43]       -19       12  +
	// 4  [63]      [47]       -16       13  /
	const uint8x16_t n51 = vdupq_n_u8(51);
	const uint8x16_t n25 = vdupq_n_u8(25);
	uint8x16x4_t out;

	for (int i = 0; i < 4; i++) {
		const uint8x16_t value = in.val[i];

		// Saturating subtraction gives the right index for range #0
		// and an index one too low for the other ranges:
		uint8x16_t index = vqsubq_u8(value, n51);

		// The comparison yields 0xFF (-1) for ranges #1..4, so
		// subtracting it adds the missing 1 to those indices:
		index = vsubq_u8(index, vcgtq_u8(value, n25));

		// Look up the offset and add it to the value:
		out.val[i] = vaddq_u8(value, vqtbl1q_u8(lut, index));
	}

	return out;
}

View File

@@ -0,0 +1,93 @@
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_NEON64
#include <arm_neon.h>
// Only enable inline assembly on supported compilers.
#if defined(__GNUC__) || defined(__clang__)
#define BASE64_NEON64_USE_ASM
#endif
// Loads the 64 bytes at `p' into a four-vector table value. With
// BASE64_NEON64_USE_ASM defined, the load is a single `ld1' instruction into
// the fixed registers v8..v11; otherwise the vld1q_u8_x4 intrinsic is used.
static BASE64_FORCE_INLINE uint8x16x4_t
load_64byte_table (const uint8_t *p)
{
#ifdef BASE64_NEON64_USE_ASM
// Force the table to be loaded into contiguous registers. GCC will not
// normally allocate contiguous registers for a `uint8x16x4_t'. These
// registers are chosen to not conflict with the ones in the enc loop.
register uint8x16_t t0 __asm__ ("v8");
register uint8x16_t t1 __asm__ ("v9");
register uint8x16_t t2 __asm__ ("v10");
register uint8x16_t t3 __asm__ ("v11");
// The post-indexed load also advances the local copy of `p' by 64, which is
// why `p' is bound as a read-write operand; the new value is not used.
// NOTE(review): the statement reads memory through `p' but declares neither
// a memory input operand nor a "memory" clobber. Confirm this is acceptable
// for every caller's table.
__asm__ (
"ld1 {%[t0].16b, %[t1].16b, %[t2].16b, %[t3].16b}, [%[src]], #64 \n\t"
: [src] "+r" (p),
[t0] "=w" (t0),
[t1] "=w" (t1),
[t2] "=w" (t2),
[t3] "=w" (t3)
);
return (uint8x16x4_t) {
.val[0] = t0,
.val[1] = t1,
.val[2] = t2,
.val[3] = t3,
};
#else
return vld1q_u8_x4(p);
#endif
}
#include "../generic/32/dec_loop.c"
#include "../generic/64/enc_loop.c"
#include "dec_loop.c"
#ifdef BASE64_NEON64_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_loop.c"
#endif
#endif // HAVE_NEON64
// Stride size is so large on these NEON 64-bit functions
// (48 bytes encode, 64 bytes decode) that we inline the
// uint64 codec to stay performant on smaller inputs.
// Streaming encoder entry point for the NEON64 codec.
//
// The parameter list is supplied by the BASE64_ENC_PARAMS macro, defined
// outside this file. When HAVE_NEON64 is nonzero the body is assembled from
// the two shared encoder fragments with two bulk loops between them: the NEON
// loop runs first, then the generic 64-bit loop takes 6-byte rounds from what
// is left. Otherwise the call is forwarded to base64_enc_stub().
void
base64_stream_encode_neon64 BASE64_ENC_PARAMS
{
#if HAVE_NEON64
#include "../generic/enc_head.c"
enc_loop_neon64(&s, &slen, &o, &olen);
enc_loop_generic_64(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming decoder entry point for the NEON64 codec.
//
// The parameter list is supplied by the BASE64_DEC_PARAMS macro, defined
// outside this file. When HAVE_NEON64 is nonzero the body is assembled from
// the two shared decoder fragments with two bulk loops between them: the NEON
// loop takes whole 64-byte rounds first, then the generic 32-bit loop takes
// 4-byte rounds from what is left (there is no 64-bit generic decode loop
// here). Otherwise the call is forwarded to base64_dec_stub() and its result
// is returned.
// NOTE(review): the NEON branch has no return statement of its own, so the
// included dec_tail.c is presumably what returns the result -- confirm.
int
base64_stream_decode_neon64 BASE64_DEC_PARAMS
{
#if HAVE_NEON64
#include "../generic/dec_head.c"
dec_loop_neon64(&s, &slen, &o, &olen);
dec_loop_generic_32(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,129 @@
// The input consists of five valid character sets in the Base64 alphabet,
// which we need to map back to the 6-bit values they represent.
// There are three ranges, two singles, and then there's the rest.
//
// # From To LUT Characters
// 1 [0..42] [255] #1 invalid input
// 2 [43] [62] #1 +
// 3 [44..46] [255] #1 invalid input
// 4 [47] [63] #1 /
// 5 [48..57] [52..61] #1 0..9
// 6 [58..63] [255] #1 invalid input
// 7 [64] [255] #2 invalid input
// 8 [65..90] [0..25] #2 A..Z
// 9 [91..96] [255] #2 invalid input
// 10 [97..122] [26..51] #2 a..z
// 11 [123..126] [255] #2 invalid input
// (12) Everything else => invalid input
// The first LUT is looked up with vqtbl4q_u8 (a TBL-style lookup: out of range
// indices are set to 0 in destination). It is indexed directly by the input
// character and covers input values 0..63: '+' (43) maps to 62, '/' (47) maps
// to 63, '0'..'9' (48..57) map to 52..61, and every other entry is 255.
static const uint8_t dec_lut1[] = {
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U,
255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 255U, 62U, 255U, 255U, 255U, 63U,
52U, 53U, 54U, 55U, 56U, 57U, 58U, 59U, 60U, 61U, 255U, 255U, 255U, 255U, 255U, 255U,
};
// The second LUT is looked up with vqtbx4q_u8 (a TBX-style lookup: out of
// range indices will be unchanged in destination). The index is the input
// character minus 63 with saturation, so input [64..126] will be mapped to
// index [1..63] in this LUT. Index 0 means that value comes from LUT #1.
// 'A'..'Z' (index 2..27) map to 0..25, 'a'..'z' (index 34..59) map to 26..51,
// and the remaining nonzero indices hold 255.
static const uint8_t dec_lut2[] = {
0U, 255U, 0U, 1U, 2U, 3U, 4U, 5U, 6U, 7U, 8U, 9U, 10U, 11U, 12U, 13U,
14U, 15U, 16U, 17U, 18U, 19U, 20U, 21U, 22U, 23U, 24U, 25U, 255U, 255U, 255U, 255U,
255U, 255U, 26U, 27U, 28U, 29U, 30U, 31U, 32U, 33U, 34U, 35U, 36U, 37U, 38U, 39U,
40U, 41U, 42U, 43U, 44U, 45U, 46U, 47U, 48U, 49U, 50U, 51U, 255U, 255U, 255U, 255U,
};
// All input values in range for the first look-up will be 0U in the second
// look-up result. All input values out of range for the first look-up will be
// 0U in the first look-up result. Thus, the two results can be ORed without
// conflicts.
//
// Invalid characters that are in the valid range for either look-up will be
// set to 255U in the combined result. Other invalid characters will just be
// passed through with the second look-up result (using the VTBX instruction).
// Since the second LUT is 64 bytes, those passed-through values are guaranteed
// to have a value greater than 63U. Therefore, valid characters will be mapped
// to the valid [0..63] range and all invalid characters will be mapped to
// values greater than 63.
static inline void
dec_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    // A round needs a complete 64-byte input block; without one there is
    // nothing for this codec to do.
    if (*slen < 64) {
        return;
    }

    // Every complete 64-byte block is eligible. Unlike the SSE codecs, no
    // extra trailing zero bytes are written, so no input has to be held
    // back in reserve.
    size_t pending = *slen / 64;

    // Book all rounds up front (64 bytes in, 48 bytes out per round). The
    // totals are corrected after the loop if it stops early.
    *slen -= pending * 64;
    *olen += pending * 48;

    const uint8x16x4_t lut_first = load_64byte_table(dec_lut1);
    const uint8x16x4_t lut_second = load_64byte_table(dec_lut2);
    const uint8x16_t limit = vdupq_n_u8(63U);

    while (pending > 0) {
        // Load 64 bytes, deinterleaved into four vectors of 16 bytes:
        const uint8x16x4_t in = vld4q_u8((uint8_t *) *s);

        // Indices for the second table: input minus 63, saturating at 0.
        const uint8x16_t idx0 = vqsubq_u8(in.val[0], limit);
        const uint8x16_t idx1 = vqsubq_u8(in.val[1], limit);
        const uint8x16_t idx2 = vqsubq_u8(in.val[2], limit);
        const uint8x16_t idx3 = vqsubq_u8(in.val[3], limit);

        // Look each byte up in both tables and OR the results together.
        uint8x16x4_t v;
        v.val[0] = vorrq_u8(vqtbl4q_u8(lut_first, in.val[0]),
                            vqtbx4q_u8(idx0, lut_second, idx0));
        v.val[1] = vorrq_u8(vqtbl4q_u8(lut_first, in.val[1]),
                            vqtbx4q_u8(idx1, lut_second, idx1));
        v.val[2] = vorrq_u8(vqtbl4q_u8(lut_first, in.val[2]),
                            vqtbx4q_u8(idx2, lut_second, idx2));
        v.val[3] = vorrq_u8(vqtbl4q_u8(lut_first, in.val[3]),
                            vqtbx4q_u8(idx3, lut_second, idx3));

        // Any value larger than 63 marks an invalid input character.
        const uint8x16_t bad01 = vorrq_u8(vcgtq_u8(v.val[0], limit),
                                          vcgtq_u8(v.val[1], limit));
        const uint8x16_t bad23 = vorrq_u8(vcgtq_u8(v.val[2], limit),
                                          vcgtq_u8(v.val[3], limit));

        // Stop at the first block containing an invalid byte. The block
        // is not consumed; it is handed back through slen/olen below.
        if (vmaxvq_u8(vorrq_u8(bad01, bad23)) != 0U) {
            break;
        }

        // Compress four 6-bit values into three bytes:
        uint8x16x3_t packed;
        packed.val[0] = vorrq_u8(vshlq_n_u8(v.val[0], 2), vshrq_n_u8(v.val[1], 4));
        packed.val[1] = vorrq_u8(vshlq_n_u8(v.val[1], 4), vshrq_n_u8(v.val[2], 2));
        packed.val[2] = vorrq_u8(vshlq_n_u8(v.val[2], 6), v.val[3]);

        // Interleave and store the 48 decoded bytes:
        vst3q_u8((uint8_t *) *o, packed);

        *s += 64;
        *o += 48;
        pending--;
    }

    // Give back the rounds that were booked but not executed:
    *slen += pending * 64;
    *olen -= pending * 48;
}

View File

@@ -0,0 +1,66 @@
static BASE64_FORCE_INLINE void
enc_loop_neon64_inner (const uint8_t **s, uint8_t **o, const uint8x16x4_t tbl_enc)
{
    // One encoder round: 48 input bytes become 64 output characters.
    // Load the input as three deinterleaved vectors and spread the bits of
    // every three input bytes over four 6-bit values:
    const uint8x16x4_t sextets = enc_reshuffle(vld3q_u8(*s));

    // Translate the values 0..63 to the Base64 alphabet with a 64-byte
    // table lookup per vector:
    uint8x16x4_t chars;
    chars.val[0] = vqtbl4q_u8(tbl_enc, sextets.val[0]);
    chars.val[1] = vqtbl4q_u8(tbl_enc, sextets.val[1]);
    chars.val[2] = vqtbl4q_u8(tbl_enc, sextets.val[2]);
    chars.val[3] = vqtbl4q_u8(tbl_enc, sextets.val[3]);

    // Interleave and store the output, then advance both cursors:
    vst4q_u8(*o, chars);
    *o += 64;
    *s += 48;
}
static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    // Each round turns 48 input bytes into 64 output bytes. Account for
    // all rounds up front; the loop below always runs every one of them.
    size_t todo = *slen / 48;

    *slen -= todo * 48;
    *olen += todo * 64;

    // Load the encoding table:
    const uint8x16x4_t tbl_enc = load_64byte_table(base64_table_enc_6bit);

    // Manually unrolled: peel off the largest batch of 8, 4 or 2 rounds
    // that still fits, and finish with a single round if one is left.
    while (todo > 0) {
        if (todo >= 8) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            todo -= 8;
        }
        else if (todo >= 4) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            todo -= 4;
        }
        else if (todo >= 2) {
            enc_loop_neon64_inner(s, o, tbl_enc);
            enc_loop_neon64_inner(s, o, tbl_enc);
            todo -= 2;
        }
        else {
            // Exactly one round left; run it and stop.
            enc_loop_neon64_inner(s, o, tbl_enc);
            break;
        }
    }
}

View File

@@ -0,0 +1,168 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// The macros below expand to string literals and are only meaningful inside
// an asm statement that defines the named operands they reference: src, dst,
// t0..t3 and n63. Register arguments are passed as string literals ("v12").

// Generate a block of inline assembly that loads three user-defined registers
// A, B, C from memory and deinterleaves them, post-incrementing the src
// pointer by the 48 bytes consumed. The register set should be sequential.
#define LOAD(A, B, C) \
"ld3 {"A".16b, "B".16b, "C".16b}, [%[src]], #48 \n\t"
// Generate a block of inline assembly that takes three deinterleaved registers
// and shuffles the bytes. The output is in temporary registers t0..t3.
// t1, t2 and t3 are masked with n63 to clear their top two bits; t0 needs no
// mask because the logical right shift by two already leaves them zero.
#define SHUF(A, B, C) \
"ushr %[t0].16b, "A".16b, #2 \n\t" \
"ushr %[t1].16b, "B".16b, #4 \n\t" \
"ushr %[t2].16b, "C".16b, #6 \n\t" \
"sli %[t1].16b, "A".16b, #4 \n\t" \
"sli %[t2].16b, "B".16b, #2 \n\t" \
"and %[t1].16b, %[t1].16b, %[n63].16b \n\t" \
"and %[t2].16b, %[t2].16b, %[n63].16b \n\t" \
"and %[t3].16b, "C".16b, %[n63].16b \n\t"
// Generate a block of inline assembly that takes temporary registers t0..t3
// and translates them to the base64 alphabet, using a table loaded into
// v8..v11. The output is in user-defined registers A..D.
#define TRAN(A, B, C, D) \
"tbl "A".16b, {v8.16b-v11.16b}, %[t0].16b \n\t" \
"tbl "B".16b, {v8.16b-v11.16b}, %[t1].16b \n\t" \
"tbl "C".16b, {v8.16b-v11.16b}, %[t2].16b \n\t" \
"tbl "D".16b, {v8.16b-v11.16b}, %[t3].16b \n\t"
// Generate a block of inline assembly that interleaves four registers and
// stores them, post-incrementing the destination pointer by the 64 bytes
// written.
#define STOR(A, B, C, D) \
"st4 {"A".16b, "B".16b, "C".16b, "D".16b}, [%[dst]], #64 \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result.
#define ROUND() \
LOAD("v12", "v13", "v14") \
SHUF("v12", "v13", "v14") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")
// Generate a block of assembly that generates a type A interleaved encoder
// round. It uses registers that were loaded by the previous type B round, and
// in turn loads registers for the next type B round. The load is placed
// between the shuffle and the translate steps so that it overlaps with them.
#define ROUND_A() \
SHUF("v2", "v3", "v4") \
LOAD("v12", "v13", "v14") \
TRAN("v2", "v3", "v4", "v5") \
STOR("v2", "v3", "v4", "v5")
// Type B interleaved encoder round. Same as type A, but register sets swapped.
#define ROUND_B() \
SHUF("v12", "v13", "v14") \
LOAD("v2", "v3", "v4") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")
// The first type A round needs to load its own registers.
#define ROUND_A_FIRST() \
LOAD("v2", "v3", "v4") \
ROUND_A()
// The last type B round omits the load for the next step, so a chain of
// interleaved rounds never reads more input than it encodes.
#define ROUND_B_LAST() \
SHUF("v12", "v13", "v14") \
TRAN("v12", "v13", "v14", "v15") \
STOR("v12", "v13", "v14", "v15")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Encode as many complete 48-byte input blocks as possible using hand-written
// AArch64 assembly.
//
//   s, o       in/out cursors; advanced past the bytes read and written.
//   slen, olen in/out counters; slen is reduced by 48 and olen increased by
//              64 for every round executed.
//
// Every scheduled round is executed; there is no early exit.
static inline void
enc_loop_neon64 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    size_t rounds = *slen / 48;

    if (rounds == 0) {
        return;
    }

    *slen -= rounds * 48;   // 48 bytes consumed per round.
    *olen += rounds * 64;   // 64 bytes produced per round.

    // Number of times to go through the 8x loop.
    size_t loops = rounds / 8;

    // Number of rounds remaining after the 8x loop.
    rounds %= 8;

    // Temporary registers, used as scratch space.
    uint8x16_t tmp0, tmp1, tmp2, tmp3;

    __asm__ volatile (

        // Load the encoding table into v8..v11.
        " ld1 {v8.16b-v11.16b}, [%[tbl]] \n\t"

        // If there are eight rounds or more, enter an 8x unrolled loop
        // of interleaved encoding rounds. The rounds interleave memory
        // operations (load/store) with data operations to maximize
        // pipeline throughput.
        " cbz %[loops], 4f \n\t"

        // The SIMD instructions do not touch the flags, so the result
        // of this subtraction is still valid at the b.ne below.
        "88: subs %[loops], %[loops], #1 \n\t"
        " " ROUND_A_FIRST()
        " " ROUND_B()
        " " ROUND_A()
        " " ROUND_B()
        " " ROUND_A()
        " " ROUND_B()
        " " ROUND_A()
        " " ROUND_B_LAST()
        " b.ne 88b \n\t"

        // Enter a 4x unrolled loop for rounds of 4 or more.
        "4: cmp %[rounds], #4 \n\t"
        " b.lt 30f \n\t"
        " " ROUND_A_FIRST()
        " " ROUND_B()
        " " ROUND_A()
        " " ROUND_B_LAST()
        " sub %[rounds], %[rounds], #4 \n\t"

        // Dispatch the remaining rounds 0..3.
        "30: cbz %[rounds], 0f \n\t"
        " cmp %[rounds], #2 \n\t"
        " b.eq 2f \n\t"
        " b.lt 1f \n\t"

        // Block of non-interlaced encoding rounds, which can each
        // individually be jumped to. Rounds fall through to the next.
        "3: " ROUND()
        "2: " ROUND()
        "1: " ROUND()
        "0: \n\t"

        // Outputs (modified). `rounds` must be listed here as a
        // read-write operand because the 4x block above subtracts from
        // it; an input-only operand may not be changed by the asm.
        : [loops]  "+r"  (loops),
          [rounds] "+r"  (rounds),
          [src]    "+r"  (*s),
          [dst]    "+r"  (*o),
          [t0]     "=&w" (tmp0),
          [t1]     "=&w" (tmp1),
          [t2]     "=&w" (tmp2),
          [t3]     "=&w" (tmp3)

        // Inputs (not modified).
        : [tbl] "r" (base64_table_enc_6bit),
          [n63] "w" (vdupq_n_u8(63))

        // Clobbers.
        : "v2", "v3", "v4", "v5",
          "v8", "v9", "v10", "v11",
          "v12", "v13", "v14", "v15",
          "cc", "memory"
    );
}
#pragma GCC diagnostic pop

View File

@@ -0,0 +1,31 @@
static BASE64_FORCE_INLINE uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
    // Spread the 24 bits of each input byte triple (a, b, c) over four
    // 6-bit values, one per output vector:
    //
    //   in[0]  = a7 a6 a5 a4 a3 a2 a1 a0
    //   in[1]  = b7 b6 b5 b4 b3 b2 b1 b0
    //   in[2]  = c7 c6 c5 c4 c3 c2 c1 c0
    //
    //   out[0] = 00 00 a7 a6 a5 a4 a3 a2
    //   out[1] = 00 00 a1 a0 b7 b6 b5 b4
    //   out[2] = 00 00 b3 b2 b1 b0 c7 c6
    //   out[3] = 00 00 c5 c4 c3 c2 c1 c0
    const uint8x16_t low6 = vdupq_n_u8(0x3F);
    uint8x16x4_t out;

    // The plain right shift already leaves the top two bits clear.
    out.val[0] = vshrq_n_u8(in.val[0], 2);

    // Shift the low part into place, insert the bits of the preceding
    // byte above it with shift-left-insert, then clear the top two bits.
    out.val[1] = vandq_u8(
        vsliq_n_u8(vshrq_n_u8(in.val[1], 4), in.val[0], 4), low6);
    out.val[2] = vandq_u8(
        vsliq_n_u8(vshrq_n_u8(in.val[2], 6), in.val[1], 2), low6);

    // The last value is simply the low six bits of the third byte.
    out.val[3] = vandq_u8(in.val[2], low6);

    return out;
}

View File

@@ -0,0 +1,58 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSE41
#include <smmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE41_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE41_USE_ASM 1
# else
# define BASE64_SSE41_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_SSE41_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE41
// Streaming encoder entry point for the SSE4.1 codec. The parameter list is
// supplied by the BASE64_ENC_PARAMS macro. When HAVE_SSE41 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// encoder loop; otherwise the call is forwarded to base64_enc_stub().
void
base64_stream_encode_sse41 BASE64_ENC_PARAMS
{
#if HAVE_SSE41
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. s, slen, o and olen are not
// declared in this file; presumably enc_head.c declares them (TODO confirm).
#include "../generic/enc_head.c"
enc_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
// Codec not compiled in: defer to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming decoder entry point for the SSE4.1 codec. The parameter list is
// supplied by the BASE64_DEC_PARAMS macro. When HAVE_SSE41 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// decoder loop; otherwise the result of base64_dec_stub() is returned.
int
base64_stream_decode_sse41 BASE64_DEC_PARAMS
{
#if HAVE_SSE41
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. No return statement is visible on
// this path; presumably dec_tail.c provides it (TODO confirm).
#include "../generic/dec_head.c"
dec_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// Codec not compiled in: defer to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,58 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSE42
#include <nmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
#ifndef BASE64_SSE42_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSE42_USE_ASM 1
# else
# define BASE64_SSE42_USE_ASM 0
# endif
#endif
#include "../ssse3/dec_reshuffle.c"
#include "../ssse3/dec_loop.c"
#if BASE64_SSE42_USE_ASM
# include "../ssse3/enc_loop_asm.c"
#else
# include "../ssse3/enc_translate.c"
# include "../ssse3/enc_reshuffle.c"
# include "../ssse3/enc_loop.c"
#endif
#endif // HAVE_SSE42
// Streaming encoder entry point for the SSE4.2 codec. The parameter list is
// supplied by the BASE64_ENC_PARAMS macro. When HAVE_SSE42 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// encoder loop; otherwise the call is forwarded to base64_enc_stub().
void
base64_stream_encode_sse42 BASE64_ENC_PARAMS
{
#if HAVE_SSE42
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. s, slen, o and olen are not
// declared in this file; presumably enc_head.c declares them (TODO confirm).
#include "../generic/enc_head.c"
enc_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
// Codec not compiled in: defer to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming decoder entry point for the SSE4.2 codec. The parameter list is
// supplied by the BASE64_DEC_PARAMS macro. When HAVE_SSE42 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// decoder loop; otherwise the result of base64_dec_stub() is returned.
int
base64_stream_decode_sse42 BASE64_DEC_PARAMS
{
#if HAVE_SSE42
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. No return statement is visible on
// this path; presumably dec_tail.c provides it (TODO confirm).
#include "../generic/dec_head.c"
dec_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// Codec not compiled in: defer to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,60 @@
#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>
#include "libbase64.h"
#include "../../tables/tables.h"
#include "../../codecs.h"
#include "config.h"
#include "../../env.h"
#if HAVE_SSSE3
#include <tmmintrin.h>
// Only enable inline assembly on supported compilers and on 64-bit CPUs.
// 32-bit CPUs with SSSE3 support, such as low-end Atoms, only have eight XMM
// registers, which is not enough to run the inline assembly.
#ifndef BASE64_SSSE3_USE_ASM
# if (defined(__GNUC__) || defined(__clang__)) && BASE64_WORDSIZE == 64
# define BASE64_SSSE3_USE_ASM 1
# else
# define BASE64_SSSE3_USE_ASM 0
# endif
#endif
#include "dec_reshuffle.c"
#include "dec_loop.c"
#if BASE64_SSSE3_USE_ASM
# include "enc_loop_asm.c"
#else
# include "enc_reshuffle.c"
# include "enc_translate.c"
# include "enc_loop.c"
#endif
#endif // HAVE_SSSE3
// Streaming encoder entry point for the SSSE3 codec. The parameter list is
// supplied by the BASE64_ENC_PARAMS macro. When HAVE_SSSE3 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// encoder loop; otherwise the call is forwarded to base64_enc_stub().
void
base64_stream_encode_ssse3 BASE64_ENC_PARAMS
{
#if HAVE_SSSE3
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. s, slen, o and olen are not
// declared in this file; presumably enc_head.c declares them (TODO confirm).
#include "../generic/enc_head.c"
enc_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/enc_tail.c"
#else
// Codec not compiled in: defer to the stub.
base64_enc_stub(state, src, srclen, out, outlen);
#endif
}
// Streaming decoder entry point for the SSSE3 codec. The parameter list is
// supplied by the BASE64_DEC_PARAMS macro. When HAVE_SSSE3 is set, the body is
// stitched together from the generic head and tail fragments around the SSSE3
// decoder loop; otherwise the result of base64_dec_stub() is returned.
int
base64_stream_decode_ssse3 BASE64_DEC_PARAMS
{
#if HAVE_SSSE3
// The two includes below are textual fragments of this function body, not
// headers, so their order is significant. No return statement is visible on
// this path; presumably dec_tail.c provides it (TODO confirm).
#include "../generic/dec_head.c"
dec_loop_ssse3(&s, &slen, &o, &olen);
#include "../generic/dec_tail.c"
#else
// Codec not compiled in: defer to the stub.
return base64_dec_stub(state, src, srclen, out, outlen);
#endif
}

View File

@@ -0,0 +1,173 @@
// The input consists of six character sets in the Base64 alphabet, which we
// need to map back to the 6-bit values they represent. There are three ranges,
// two singles, and then there's the rest.
//
// # From To Add Characters
// 1 [43] [62] +19 +
// 2 [47] [63] +16 /
// 3 [48..57] [52..61] +4 0..9
// 4 [65..90] [0..25] -65 A..Z
// 5 [97..122] [26..51] -71 a..z
// (6) Everything else => invalid input
//
// We will use lookup tables for character validation and offset computation.
// Remember that 0x2X and 0x0X select the same index in _mm_shuffle_epi8; this
// allows masking with 0x2F instead of 0x0F, which saves one constant
// declaration (register and/or memory access).
//
// For offsets:
// Perfect hash for lut = ((src >> 4) & 0x2F) + ((src == 0x2F) ? 0xFF : 0x00)
// 0000 = garbage
// 0001 = /
// 0010 = +
// 0011 = 0-9
// 0100 = A-Z
// 0101 = A-Z
// 0110 = a-z
// 0111 = a-z
// 1000 >= garbage
//
// For validation, here's the table.
// A character is valid if and only if the AND of the 2 lookups equals 0:
//
// hi \ lo 0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111
// LUT 0x15 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x11 0x13 0x1A 0x1B 0x1B 0x1B 0x1A
//
// 0000 0x10 char NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0001 0x10 char DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
// andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
//
// 0010 0x01 char ! " # $ % & ' ( ) * + , - . /
// andlut 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x01 0x00 0x01 0x01 0x01 0x00
//
// 0011 0x02 char 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x02 0x02 0x02 0x02 0x02 0x02
//
// 0100 0x04 char @ A B C D E F G H I J K L M N O
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
//
// 0101 0x08 char P Q R S T U V W X Y Z [ \ ] ^ _
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 0110 0x04 char ` a b c d e f g h i j k l m n o
// andlut 0x04 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00
// 0111 0x08 char p q r s t u v w x y z { | } ~
// andlut 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x00 0x08 0x08 0x08 0x08 0x08
//
// 1000 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1001 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1010 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1011 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1100 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1101 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1110 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
// 1111 0x10 andlut 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10 0x10
static BASE64_FORCE_INLINE int
dec_loop_ssse3_inner (const uint8_t **s, uint8_t **o, size_t *rounds)
{
const __m128i lut_lo = _mm_setr_epi8(
0x15, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11, 0x11,
0x11, 0x11, 0x13, 0x1A, 0x1B, 0x1B, 0x1B, 0x1A);
const __m128i lut_hi = _mm_setr_epi8(
0x10, 0x10, 0x01, 0x02, 0x04, 0x08, 0x04, 0x08,
0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10);
const __m128i lut_roll = _mm_setr_epi8(
0, 16, 19, 4, -65, -65, -71, -71,
0, 0, 0, 0, 0, 0, 0, 0);
const __m128i mask_2F = _mm_set1_epi8(0x2F);
// Load input:
__m128i str = _mm_loadu_si128((__m128i *) *s);
// Table lookups:
const __m128i hi_nibbles = _mm_and_si128(_mm_srli_epi32(str, 4), mask_2F);
const __m128i lo_nibbles = _mm_and_si128(str, mask_2F);
const __m128i hi = _mm_shuffle_epi8(lut_hi, hi_nibbles);
const __m128i lo = _mm_shuffle_epi8(lut_lo, lo_nibbles);
// Check for invalid input: if any "and" values from lo and hi are not
// zero, fall back on bytewise code to do error checking and reporting:
if (_mm_movemask_epi8(_mm_cmpgt_epi8(_mm_and_si128(lo, hi), _mm_setzero_si128())) != 0) {
return 0;
}
const __m128i eq_2F = _mm_cmpeq_epi8(str, mask_2F);
const __m128i roll = _mm_shuffle_epi8(lut_roll, _mm_add_epi8(eq_2F, hi_nibbles));
// Now simply add the delta values to the input:
str = _mm_add_epi8(str, roll);
// Reshuffle the input to packed 12-byte output format:
str = dec_reshuffle(str);
// Store the output:
_mm_storeu_si128((__m128i *) *o, str);
*s += 16;
*o += 12;
*rounds -= 1;
return 1;
}
static inline void
dec_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    // Below 24 bytes not even one round fits alongside the 8-byte reserve.
    if (*slen < 24) {
        return;
    }

    // Each round decodes 16 input bytes. Every round writes 4 extra zero
    // bytes after its output, so keep at least 8 bytes of input in
    // reserve to cover that gap (6 data bytes and up to two end-of-string
    // markers).
    size_t rounds = (*slen - 8) / 16;

    // Book all rounds up front; corrected below if decoding stops early.
    *slen -= rounds * 16;   // 16 bytes consumed per round
    *olen += rounds * 12;   // 12 bytes produced per round

    // Manually unrolled in batches of 8, 4 and 2 rounds. The inner
    // function decrements `rounds` itself on success; a batch is cut short
    // at the first round that reports invalid input.
    for (;;) {
        int ok;

        if (rounds >= 8) {
            ok = dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds);
        }
        else if (rounds >= 4) {
            ok = dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds);
        }
        else if (rounds >= 2) {
            ok = dec_loop_ssse3_inner(s, o, &rounds) &&
                 dec_loop_ssse3_inner(s, o, &rounds);
        }
        else {
            // Exactly one round left. Its outcome does not matter for
            // control flow: the loop ends either way.
            dec_loop_ssse3_inner(s, o, &rounds);
            break;
        }

        if (!ok || rounds == 0) {
            break;
        }
    }

    // Adjust for any rounds that were skipped:
    *slen += rounds * 16;
    *olen -= rounds * 12;
}

View File

@@ -0,0 +1,33 @@
static BASE64_FORCE_INLINE __m128i
dec_reshuffle (const __m128i in)
{
    // Pack sixteen 6-bit values (one per input byte) into twelve bytes.
    // In the bit diagrams, upper case letters are the most significant
    // bits of a value and lower case letters the least significant bits.
    //
    // in:
    // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
    // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
    // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
    // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA

    // Step 1: multiply-add adjacent bytes into 16-bit words.
    const __m128i mul_bytes = _mm_set1_epi32(0x01400140);
    const __m128i words = _mm_maddubs_epi16(in, mul_bytes);
    // 0000kkkk LLllllll 0000JJJJ JJjjKKKK
    // 0000hhhh IIiiiiii 0000GGGG GGggHHHH
    // 0000eeee FFffffff 0000DDDD DDddEEEE
    // 0000bbbb CCcccccc 0000AAAA AAaaBBBB

    // Step 2: multiply-add adjacent words into 32-bit lanes.
    const __m128i mul_words = _mm_set1_epi32(0x00011000);
    const __m128i dwords = _mm_madd_epi16(words, mul_words);
    // 00000000 JJJJJJjj KKKKkkkk LLllllll
    // 00000000 GGGGGGgg HHHHhhhh IIiiiiii
    // 00000000 DDDDDDdd EEEEeeee FFffffff
    // 00000000 AAAAAAaa BBBBbbbb CCcccccc

    // Step 3: take the three payload bytes of each lane in reversed
    // order and pack them together; the top four bytes become zero.
    const __m128i pack = _mm_setr_epi8(
         2,  1,  0,
         6,  5,  4,
        10,  9,  8,
        14, 13, 12,
        -1, -1, -1, -1);
    return _mm_shuffle_epi8(dwords, pack);
    // 00000000 00000000 00000000 00000000
    // LLllllll KKKKkkkk JJJJJJjj IIiiiiii
    // HHHHhhhh GGGGGGgg FFffffff EEEEeeee
    // DDDDDDdd CCcccccc BBBBbbbb AAAAAAaa
}

View File

@@ -0,0 +1,67 @@
static BASE64_FORCE_INLINE void
enc_loop_ssse3_inner (const uint8_t **s, uint8_t **o)
{
// Load input:
__m128i str = _mm_loadu_si128((__m128i *) *s);
// Reshuffle:
str = enc_reshuffle(str);
// Translate reshuffled bytes to the Base64 alphabet:
str = enc_translate(str);
// Store:
_mm_storeu_si128((__m128i *) *o, str);
*s += 12;
*o += 16;
}
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
    // Below 16 bytes not even one 16-byte load fits in the input.
    if (*slen < 16) {
        return;
    }

    // Blocks are encoded 12 bytes at a time but loaded 16 bytes at a
    // time. Keep 4 bytes in reserve after the last round so that the
    // final load never reads past the end of the input buffer:
    size_t todo = (*slen - 4) / 12;

    *slen -= todo * 12;   // 12 bytes consumed per round
    *olen += todo * 16;   // 16 bytes produced per round

    // Manually unrolled: peel off the largest batch of 8, 4 or 2 rounds
    // that still fits, and finish with a single round if one is left.
    while (todo > 0) {
        if (todo >= 8) {
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            todo -= 8;
        }
        else if (todo >= 4) {
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            todo -= 4;
        }
        else if (todo >= 2) {
            enc_loop_ssse3_inner(s, o);
            enc_loop_ssse3_inner(s, o);
            todo -= 2;
        }
        else {
            // Exactly one round left; run it and stop.
            enc_loop_ssse3_inner(s, o);
            break;
        }
    }
}

View File

@@ -0,0 +1,268 @@
// Apologies in advance for combining the preprocessor with inline assembly,
// two notoriously gnarly parts of C, but it was necessary to avoid a lot of
// code repetition. The preprocessor is used to template large sections of
// inline assembly that differ only in the registers used. If the code was
// written out by hand, it would become very large and hard to audit.
// The macros below expand to string literals and are only meaningful inside
// an asm statement that defines the named operands they reference (src, dst,
// lut0, lut1, msk0..msk3, n51, n25 and the registers passed by name). The
// assembly is in AT&T syntax: the destination is the last operand.

// Generate a block of inline assembly that loads register R0 from memory. The
// offset at which the register is loaded is set by the given round (12 input
// bytes per round).
#define LOAD(R0, ROUND) \
"lddqu ("#ROUND" * 12)(%[src]), %["R0"] \n\t"
// Generate a block of inline assembly that deinterleaves and shuffles register
// R0 using preloaded constants. The result is left in R0; R1 is overwritten
// and used as scratch space.
#define SHUF(R0, R1) \
"pshufb %[lut0], %["R0"] \n\t" \
"movdqa %["R0"], %["R1"] \n\t" \
"pand %[msk0], %["R0"] \n\t" \
"pand %[msk2], %["R1"] \n\t" \
"pmulhuw %[msk1], %["R0"] \n\t" \
"pmullw %[msk3], %["R1"] \n\t" \
"por %["R1"], %["R0"] \n\t"
// Generate a block of inline assembly that translates the contents of R0 to
// the base64 alphabet, using preloaded constants. The result is left in R0;
// R1 and R2 are overwritten and used as scratch space.
#define TRAN(R0, R1, R2) \
"movdqa %["R0"], %["R1"] \n\t" \
"movdqa %["R0"], %["R2"] \n\t" \
"psubusb %[n51], %["R1"] \n\t" \
"pcmpgtb %[n25], %["R2"] \n\t" \
"psubb %["R2"], %["R1"] \n\t" \
"movdqa %[lut1], %["R2"] \n\t" \
"pshufb %["R1"], %["R2"] \n\t" \
"paddb %["R2"], %["R0"] \n\t"
// Generate a block of inline assembly that stores the given register R0 at an
// offset set by the given round (16 output bytes per round).
#define STOR(R0, ROUND) \
"movdqu %["R0"], ("#ROUND" * 16)(%[dst]) \n\t"
// Generate a block of inline assembly that generates a single self-contained
// encoder round: fetch the data, process it, and store the result. Then update
// the source and destination pointers.
#define ROUND() \
LOAD("a", 0) \
SHUF("a", "b") \
TRAN("a", "b", "c") \
STOR("a", 0) \
"add $12, %[src] \n\t" \
"add $16, %[dst] \n\t"
// Define a macro that initiates a three-way interleaved encoding round by
// preloading registers a, b and c from memory.
// The register graph shows which registers are in use during each step, and
// is a visual aid for choosing registers for that step. Symbol index:
//
// + indicates that a register is loaded by that step.
// | indicates that a register is in use and must not be touched.
// - indicates that a register is decommissioned by that step.
// x indicates that a register is used as a temporary by that step.
// V indicates that a register is an input or output to the macro.
//
#define ROUND_3_INIT() /* a b c d e f */ \
LOAD("a", 0) /* + */ \
SHUF("a", "d") /* | + */ \
LOAD("b", 1) /* | + | */ \
TRAN("a", "d", "e") /* | | - x */ \
LOAD("c", 2) /* V V V */
// Define a macro that translates, shuffles and stores the input registers A, B
// and C, and preloads registers D, E and F for the next round.
// This macro can be arbitrarily daisy-chained by feeding output registers D, E
// and F back into the next round as input registers A, B and C. The macro
// carefully interleaves memory operations with data operations for optimal
// pipelined performance.
// Unlike ROUND(), the interleaved macros address memory by round offset and
// do not advance src or dst themselves.
#define ROUND_3(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
LOAD(D, (ROUND + 3)) /* V V V + */ \
SHUF(B, E) /* | | | | + */ \
STOR(A, (ROUND + 0)) /* - | | | | */ \
TRAN(B, E, F) /* | | | - x */ \
LOAD(E, (ROUND + 4)) /* | | | + */ \
SHUF(C, A) /* + | | | | */ \
STOR(B, (ROUND + 1)) /* | - | | | */ \
TRAN(C, A, F) /* - | | | x */ \
LOAD(F, (ROUND + 5)) /* | | | + */ \
SHUF(D, A) /* + | | | | */ \
STOR(C, (ROUND + 2)) /* | - | | | */ \
TRAN(D, A, B) /* - x V V V */
// Define a macro that terminates a ROUND_3 macro by taking pre-loaded
// registers D, E and F, and translating, shuffling and storing them.
// It issues no further loads.
#define ROUND_3_END(ROUND, A,B,C,D,E,F) /* A B C D E F */ \
SHUF(E, A) /* + V V V */ \
STOR(D, (ROUND + 3)) /* | - | | */ \
TRAN(E, A, B) /* - x | | */ \
SHUF(F, C) /* + | | */ \
STOR(E, (ROUND + 4)) /* | - | */ \
TRAN(F, C, D) /* - x | */ \
STOR(F, (ROUND + 5)) /* - */
// Define a type A round. Inputs are a, b, and c, outputs are d, e, and f.
#define ROUND_3_A(ROUND) \
ROUND_3(ROUND, "a", "b", "c", "d", "e", "f")
// Define a type B round. Inputs and outputs are swapped with regard to type A.
#define ROUND_3_B(ROUND) \
ROUND_3(ROUND, "d", "e", "f", "a", "b", "c")
// Terminating macro for a type A round.
#define ROUND_3_A_LAST(ROUND) \
ROUND_3_A(ROUND) \
ROUND_3_END(ROUND, "a", "b", "c", "d", "e", "f")
// Terminating macro for a type B round.
#define ROUND_3_B_LAST(ROUND) \
ROUND_3_B(ROUND) \
ROUND_3_END(ROUND, "d", "e", "f", "a", "b", "c")
// Suppress clang's warning that the literal string in the asm statement is
// overlong (longer than the ISO-mandated minimum size of 4095 bytes for C99
// compilers). It may be true, but the goal here is not C99 portability.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Woverlength-strings"
// Encode as many 12-byte input blocks as possible using hand-written x86
// assembly.
//
//   s, o       in/out cursors; advanced by the asm past the bytes consumed
//              and produced.
//   slen, olen in/out counters; slen is reduced by 12 and olen increased by
//              16 for every round scheduled.
//
// Every scheduled round is executed; there is no early exit.
static inline void
enc_loop_ssse3 (const uint8_t **s, size_t *slen, uint8_t **o, size_t *olen)
{
// For a clearer explanation of the algorithm used by this function,
// please refer to the plain (not inline assembly) implementation. This
// function follows the same basic logic.
if (*slen < 16) {
return;
}
// Process blocks of 12 bytes at a time. Input is read in blocks of 16
// bytes, so "reserve" four bytes from the input buffer to ensure that
// we never read beyond the end of the input buffer.
size_t rounds = (*slen - 4) / 12;
*slen -= rounds * 12; // 12 bytes consumed per round
*olen += rounds * 16; // 16 bytes produced per round
// Number of times to go through the 36x loop.
size_t loops = rounds / 36;
// Number of rounds remaining after the 36x loop. Because this is always
// below 36, the signed conditional jumps (jl/jg) used on it in the
// assembly below are safe.
rounds %= 36;
// Lookup tables. lut0 is the byte shuffle used by SHUF; lut1 holds the
// per-range offsets used by TRAN.
const __m128i lut0 = _mm_set_epi8(
10, 11, 9, 10, 7, 8, 6, 7, 4, 5, 3, 4, 1, 2, 0, 1);
const __m128i lut1 = _mm_setr_epi8(
65, 71, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -19, -16, 0, 0);
// Temporary registers.
__m128i a, b, c, d, e, f;
__asm__ volatile (
// If there are 36 rounds or more, enter a 36x unrolled loop of
// interleaved encoding rounds. The rounds interleave memory
// operations (load/store) with data operations (table lookups,
// etc) to maximize pipeline throughput.
" test %[loops], %[loops] \n\t"
" jz 18f \n\t"
" jmp 36f \n\t"
" \n\t"
// Align the top of the hot loop on a 64-byte boundary.
".balign 64 \n\t"
"36: " ROUND_3_INIT()
" " ROUND_3_A( 0)
" " ROUND_3_B( 3)
" " ROUND_3_A( 6)
" " ROUND_3_B( 9)
" " ROUND_3_A(12)
" " ROUND_3_B(15)
" " ROUND_3_A(18)
" " ROUND_3_B(21)
" " ROUND_3_A(24)
" " ROUND_3_B(27)
" " ROUND_3_A_LAST(30)
// The interleaved rounds address memory by offset, so the pointers
// are advanced once per unrolled block.
" add $(12 * 36), %[src] \n\t"
" add $(16 * 36), %[dst] \n\t"
" dec %[loops] \n\t"
" jnz 36b \n\t"
// Enter an 18x unrolled loop for rounds of 18 or more.
"18: cmp $18, %[rounds] \n\t"
" jl 9f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B(3)
" " ROUND_3_A(6)
" " ROUND_3_B(9)
" " ROUND_3_A_LAST(12)
" sub $18, %[rounds] \n\t"
" add $(12 * 18), %[src] \n\t"
" add $(16 * 18), %[dst] \n\t"
// Enter a 9x unrolled loop for rounds of 9 or more.
"9: cmp $9, %[rounds] \n\t"
" jl 6f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A(0)
" " ROUND_3_B_LAST(3)
" sub $9, %[rounds] \n\t"
" add $(12 * 9), %[src] \n\t"
" add $(16 * 9), %[dst] \n\t"
// Enter a 6x unrolled loop for rounds of 6 or more.
"6: cmp $6, %[rounds] \n\t"
" jl 55f \n\t"
" " ROUND_3_INIT()
" " ROUND_3_A_LAST(0)
" sub $6, %[rounds] \n\t"
" add $(12 * 6), %[src] \n\t"
" add $(16 * 6), %[dst] \n\t"
// Dispatch the remaining rounds 0..5.
"55: cmp $3, %[rounds] \n\t"
" jg 45f \n\t"
" je 3f \n\t"
" cmp $1, %[rounds] \n\t"
" jg 2f \n\t"
" je 1f \n\t"
" jmp 0f \n\t"
"45: cmp $4, %[rounds] \n\t"
" je 4f \n\t"
// Block of non-interlaced encoding rounds, which can each
// individually be jumped to. Rounds fall through to the next.
"5: " ROUND()
"4: " ROUND()
"3: " ROUND()
"2: " ROUND()
"1: " ROUND()
"0: \n\t"
// Outputs (modified). rounds and loops are read-write because the
// assembly decrements them; a..f are early-clobber scratch registers.
: [rounds] "+r" (rounds),
[loops] "+r" (loops),
[src] "+r" (*s),
[dst] "+r" (*o),
[a] "=&x" (a),
[b] "=&x" (b),
[c] "=&x" (c),
[d] "=&x" (d),
[e] "=&x" (e),
[f] "=&x" (f)
// Inputs (not modified).
: [lut0] "x" (lut0),
[lut1] "x" (lut1),
[msk0] "x" (_mm_set1_epi32(0x0FC0FC00)),
[msk1] "x" (_mm_set1_epi32(0x04000040)),
[msk2] "x" (_mm_set1_epi32(0x003F03F0)),
[msk3] "x" (_mm_set1_epi32(0x01000010)),
[n51] "x" (_mm_set1_epi8(51)),
[n25] "x" (_mm_set1_epi8(25))
// Clobbers.
: "cc", "memory"
);
}
#pragma GCC diagnostic pop

View File

@@ -0,0 +1,48 @@
static BASE64_FORCE_INLINE __m128i
enc_reshuffle (__m128i in)
{
    // Turn the twelve payload bytes of the input into sixteen 6-bit
    // values, one per output byte. In the bit diagrams, upper case
    // letters are the most significant bits of an input byte and lower
    // case letters the least significant bits.
    //
    // Input, bytes MSB to LSB:
    // 0 0 0 0 l k j i h g f e d c b a

    // Duplicate and reorder the bytes so that every 32-bit lane holds the
    // three bytes it needs (listed here from byte 0 upwards):
    const __m128i spread = _mm_setr_epi8(
         1,  0,  2,  1,
         4,  3,  5,  4,
         7,  6,  8,  7,
        10,  9, 11, 10);
    in = _mm_shuffle_epi8(in, spread);
    // in, bytes MSB to LSB:
    // k l j k
    // h i g h
    // e f d e
    // b c a b

    // First and third value of each lane: mask them out, then move them
    // down with a high-half unsigned multiply.
    const __m128i upper = _mm_mulhi_epu16(
        _mm_and_si128(in, _mm_set1_epi32(0x0FC0FC00)),
        _mm_set1_epi32(0x04000040));
    // after the mask:
    // 0000kkkk LL000000 JJJJJJ00 00000000
    // 0000hhhh II000000 GGGGGG00 00000000
    // 0000eeee FF000000 DDDDDD00 00000000
    // 0000bbbb CC000000 AAAAAA00 00000000
    // after the multiply:
    // 00000000 00kkkkLL 00000000 00JJJJJJ
    // 00000000 00hhhhII 00000000 00GGGGGG
    // 00000000 00eeeeFF 00000000 00DDDDDD
    // 00000000 00bbbbCC 00000000 00AAAAAA

    // Second and fourth value of each lane: mask them out, then move them
    // up with a low-half multiply.
    const __m128i lower = _mm_mullo_epi16(
        _mm_and_si128(in, _mm_set1_epi32(0x003F03F0)),
        _mm_set1_epi32(0x01000010));
    // after the mask:
    // 00000000 00llllll 000000jj KKKK0000
    // 00000000 00iiiiii 000000gg HHHH0000
    // 00000000 00ffffff 000000dd EEEE0000
    // 00000000 00cccccc 000000aa BBBB0000
    // after the multiply:
    // 00llllll 00000000 00jjKKKK 00000000
    // 00iiiiii 00000000 00ggHHHH 00000000
    // 00ffffff 00000000 00ddEEEE 00000000
    // 00cccccc 00000000 00aaBBBB 00000000

    return _mm_or_si128(upper, lower);
    // 00llllll 00kkkkLL 00jjKKKK 00JJJJJJ
    // 00iiiiii 00hhhhII 00ggHHHH 00GGGGGG
    // 00ffffff 00eeeeFF 00ddEEEE 00DDDDDD
    // 00cccccc 00bbbbCC 00aaBBBB 00AAAAAA
}

View File

@@ -0,0 +1,33 @@
static BASE64_FORCE_INLINE __m128i
enc_translate (const __m128i in)
{
    // Translate values 0..63 to the Base64 alphabet by adding a per-range
    // offset. There are five ranges:
    //
    // #  From      To         Abs    Index   Characters
    // 0  [0..25]   [65..90]   +65        0   ABCDEFGHIJKLMNOPQRSTUVWXYZ
    // 1  [26..51]  [97..122]  +71        1   abcdefghijklmnopqrstuvwxyz
    // 2  [52..61]  [48..57]    -4  [2..11]   0123456789
    // 3  [62]      [43]       -19       12   +
    // 4  [63]      [47]       -16       13   /

    // The absolute offsets for all ranges, by index:
    const __m128i offsets = _mm_setr_epi8(
         65,  71,  -4,  -4,
         -4,  -4,  -4,  -4,
         -4,  -4,  -4,  -4,
        -19, -16,   0,   0);

    // Saturating subtraction of 51 gives the right index for range #0
    // but one less than needed for the other ranges.
    const __m128i base = _mm_subs_epu8(in, _mm_set1_epi8(51));

    // 0xFF (-1) for ranges #1..4 and 0x00 for range #0:
    const __m128i above25 = _mm_cmpgt_epi8(in, _mm_set1_epi8(25));

    // Subtracting -1 adds the missing 1 for ranges #1..4, after which
    // all indices are correct:
    const __m128i index = _mm_sub_epi8(base, above25);

    // Add the selected offsets to the input values:
    return _mm_add_epi8(in, _mm_shuffle_epi8(offsets, index));
}

View File

@@ -0,0 +1,314 @@
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include "libbase64.h"
#include "codecs.h"
#include "config.h"
#include "env.h"
// Detect x86 targets. GCC and Clang define __x86_64__ / __i386__; MSVC defines
// _M_X64 (64-bit) and _M_IX86 (32-bit). `_M_X86` is not predefined by MSVC, so
// testing only that name never matched 32-bit MSVC builds; it is kept in the
// list so that builds which define it by hand keep working.
#if (__x86_64__ || __i386__ || _M_IX86 || _M_X86 || _M_X64)
#define BASE64_X86
// BASE64_X86_SIMD is set when at least one x86 SIMD codec is enabled in the
// build configuration. Undefined HAVE_* macros evaluate to 0 here.
#if (HAVE_SSSE3 || HAVE_SSE41 || HAVE_SSE42 || HAVE_AVX || HAVE_AVX2 || HAVE_AVX512)
#define BASE64_X86_SIMD
#endif
#endif
#ifdef BASE64_X86
#ifdef _MSC_VER
#include <intrin.h>
// MSVC shim: provide `__cpuid_count` with the same argument list that the
// non-MSVC branch uses, implemented with the __cpuidex intrinsic. Queries CPUID
// leaf `__level`, subleaf `__count`, and assigns the four result words to the
// lvalues passed as __eax, __ebx, __ecx and __edx. The macro expands to a braced
// block (a statement), not an expression.
#define __cpuid_count(__level, __count, __eax, __ebx, __ecx, __edx) \
{ \
int info[4]; \
__cpuidex(info, __level, __count); \
__eax = info[0]; \
__ebx = info[1]; \
__ecx = info[2]; \
__edx = info[3]; \
}
// Same query with the subleaf fixed to 0.
#define __cpuid(__level, __eax, __ebx, __ecx, __edx) \
__cpuid_count(__level, 0, __eax, __ebx, __ecx, __edx)
#else
#include <cpuid.h>
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
#if ((__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 2) || (__clang_major__ >= 3))
// Execute XGETBV for the extended control register selected by `index`
// (passed in ECX) and return the 64-bit result that the instruction leaves
// in EDX:EAX.
static inline uint64_t _xgetbv (uint32_t index)
{
	uint32_t lo;
	uint32_t hi;

	__asm__ __volatile__("xgetbv" : "=a"(lo), "=d"(hi) : "c"(index));

	return (uint64_t)lo | ((uint64_t)hi << 32);
}
#else
#error "Platform not supported"
#endif
#endif
#endif
// CPUID feature bits. Each one is only defined here when the compiler's own
// headers have not already provided it.
//
// Leaf 7, EBX:
#ifndef bit_AVX512vl
// Written as an unsigned constant: `1 << 31` shifts into the sign bit of a
// signed int, which is undefined behavior in C. The unsigned form has the
// same bit pattern and is what the unsigned register value is masked with.
#define bit_AVX512vl (1U << 31)
#endif
// Leaf 7, ECX:
#ifndef bit_AVX512vbmi
#define bit_AVX512vbmi (1 << 1)
#endif
// Leaf 7, EBX:
#ifndef bit_AVX2
#define bit_AVX2 (1 << 5)
#endif
// Leaf 1, ECX:
#ifndef bit_SSSE3
#define bit_SSSE3 (1 << 9)
#endif
#ifndef bit_SSE41
#define bit_SSE41 (1 << 19)
#endif
#ifndef bit_SSE42
#define bit_SSE42 (1 << 20)
#endif
#ifndef bit_AVX
#define bit_AVX (1 << 28)
#endif
// Leaf 1, ECX: tested before XGETBV is executed.
#define bit_XSAVE_XRSTORE (1 << 27)
// Index passed to _xgetbv() to read the register-state mask:
#ifndef _XCR_XFEATURE_ENABLED_MASK
#define _XCR_XFEATURE_ENABLED_MASK 0
#endif
// Bits of the mask returned by _xgetbv(): which register state is enabled.
#define bit_XMM (1 << 1)
#define bit_YMM (1 << 2)
#define bit_OPMASK (1 << 5)
#define bit_ZMM (1 << 6)
#define bit_HIGH_ZMM (1 << 7)
// State that must be enabled before any AVX-family codec is used:
#define _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS (bit_XMM | bit_YMM)
// State that must additionally be enabled for the AVX512 codec:
#define _AVX_512_ENABLED_BY_OS (bit_XMM | bit_YMM | bit_OPMASK | bit_ZMM | bit_HIGH_ZMM)
#endif
// Function declarations:
// BASE64_CODEC_FUNCS(arch) declares the encoder/decoder pair
// base64_stream_encode_<arch> / base64_stream_decode_<arch>, using the shared
// parameter lists BASE64_ENC_PARAMS and BASE64_DEC_PARAMS.
#define BASE64_CODEC_FUNCS(arch) \
extern void base64_stream_encode_ ## arch BASE64_ENC_PARAMS; \
extern int base64_stream_decode_ ## arch BASE64_DEC_PARAMS;
// Every codec is declared unconditionally, because codec_choose_forced()
// refers to all of them regardless of the build configuration.
BASE64_CODEC_FUNCS(avx512)
BASE64_CODEC_FUNCS(avx2)
BASE64_CODEC_FUNCS(neon32)
BASE64_CODEC_FUNCS(neon64)
BASE64_CODEC_FUNCS(plain)
BASE64_CODEC_FUNCS(ssse3)
BASE64_CODEC_FUNCS(sse41)
BASE64_CODEC_FUNCS(sse42)
BASE64_CODEC_FUNCS(avx)
static bool
codec_choose_forced (struct codec *codec, int flags)
{
// If the user wants to use a certain codec,
// always allow it, even if the codec is a no-op.
// For testing purposes.
if (!(flags & 0xFFFF)) {
return false;
}
if (flags & BASE64_FORCE_AVX2) {
codec->enc = base64_stream_encode_avx2;
codec->dec = base64_stream_decode_avx2;
return true;
}
if (flags & BASE64_FORCE_NEON32) {
codec->enc = base64_stream_encode_neon32;
codec->dec = base64_stream_decode_neon32;
return true;
}
if (flags & BASE64_FORCE_NEON64) {
codec->enc = base64_stream_encode_neon64;
codec->dec = base64_stream_decode_neon64;
return true;
}
if (flags & BASE64_FORCE_PLAIN) {
codec->enc = base64_stream_encode_plain;
codec->dec = base64_stream_decode_plain;
return true;
}
if (flags & BASE64_FORCE_SSSE3) {
codec->enc = base64_stream_encode_ssse3;
codec->dec = base64_stream_decode_ssse3;
return true;
}
if (flags & BASE64_FORCE_SSE41) {
codec->enc = base64_stream_encode_sse41;
codec->dec = base64_stream_decode_sse41;
return true;
}
if (flags & BASE64_FORCE_SSE42) {
codec->enc = base64_stream_encode_sse42;
codec->dec = base64_stream_decode_sse42;
return true;
}
if (flags & BASE64_FORCE_AVX) {
codec->enc = base64_stream_encode_avx;
codec->dec = base64_stream_decode_avx;
return true;
}
if (flags & BASE64_FORCE_AVX512) {
codec->enc = base64_stream_encode_avx512;
codec->dec = base64_stream_decode_avx512;
return true;
}
return false;
}
// Select a NEON codec on ARM builds.
//
// There is no portable userland way to probe for NEON at runtime the way
// cpuid allows on x86, so the decision is made entirely at compile time.
// NEON64 takes precedence over NEON32. Returns true when a codec was
// installed into `codec`, false when no NEON codec is configured.
static bool
codec_choose_arm (struct codec *codec)
{
#if HAVE_NEON64
	codec->enc = base64_stream_encode_neon64;
	codec->dec = base64_stream_decode_neon64;
	return true;
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && HAVE_NEON32
	codec->enc = base64_stream_encode_neon32;
	codec->dec = base64_stream_decode_neon32;
	return true;
#else
	(void)codec;
	return false;
#endif
}
// Select the best x86 SIMD codec that is both compiled in (HAVE_* macros) and
// reported as usable by CPUID/XGETBV on the running CPU.
//
// Candidates are tried in this order: AVX512, AVX2, AVX, SSE4.2, SSE4.1,
// SSSE3. The first one that passes its checks is installed into `codec` and
// the function returns true. Returns false when no candidate passes, or when
// the build has no x86 SIMD codec at all (BASE64_X86_SIMD undefined).
static bool
codec_choose_x86 (struct codec *codec)
{
#ifdef BASE64_X86_SIMD
unsigned int eax, ebx = 0, ecx = 0, edx;
unsigned int max_level;
// max_level is the value reported for CPUID leaf 0; every query below first
// checks that the leaf it needs does not exceed it.
#ifdef _MSC_VER
int info[4];
__cpuidex(info, 0, 0);
max_level = info[0];
#else
max_level = __get_cpuid_max(0, NULL);
#endif
#if HAVE_AVX512 || HAVE_AVX2 || HAVE_AVX
// Check for AVX/AVX2/AVX512 support:
// Checking for AVX requires 3 things:
// 1) CPUID indicates that the OS uses XSAVE and XRSTORE instructions
// (allowing saving YMM registers on context switch)
// 2) CPUID indicates support for AVX
// 3) XGETBV indicates the AVX registers will be saved and restored on
// context switch
//
// Note that XGETBV is only available on 686 or later CPUs, so the
// instruction needs to be conditionally run.
if (max_level >= 1) {
__cpuid_count(1, 0, eax, ebx, ecx, edx);
if (ecx & bit_XSAVE_XRSTORE) {
uint64_t xcr_mask;
xcr_mask = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
if ((xcr_mask & _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) == _XCR_XMM_AND_YMM_STATE_ENABLED_BY_OS) { // check multiple bits at once
#if HAVE_AVX512
// AVX512 additionally needs the opmask and ZMM state bits in xcr_mask, plus
// both the AVX512VL (EBX) and AVX512VBMI (ECX) feature bits of leaf 7:
if (max_level >= 7 && ((xcr_mask & _AVX_512_ENABLED_BY_OS) == _AVX_512_ENABLED_BY_OS)) {
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if ((ebx & bit_AVX512vl) && (ecx & bit_AVX512vbmi)) {
codec->enc = base64_stream_encode_avx512;
codec->dec = base64_stream_decode_avx512;
return true;
}
}
#endif
#if HAVE_AVX2
// AVX2 is reported in EBX of leaf 7:
if (max_level >= 7) {
__cpuid_count(7, 0, eax, ebx, ecx, edx);
if (ebx & bit_AVX2) {
codec->enc = base64_stream_encode_avx2;
codec->dec = base64_stream_decode_avx2;
return true;
}
}
#endif
#if HAVE_AVX
// Leaf 1 is queried again here because the leaf 7 queries above may have
// overwritten ecx:
__cpuid_count(1, 0, eax, ebx, ecx, edx);
if (ecx & bit_AVX) {
codec->enc = base64_stream_encode_avx;
codec->dec = base64_stream_decode_avx;
return true;
}
#endif
}
}
}
#endif
// The SSE-family checks below only test a CPUID feature bit; unlike the AVX
// family they do not consult XGETBV.
#if HAVE_SSE42
// Check for SSE42 support:
if (max_level >= 1) {
__cpuid(1, eax, ebx, ecx, edx);
if (ecx & bit_SSE42) {
codec->enc = base64_stream_encode_sse42;
codec->dec = base64_stream_decode_sse42;
return true;
}
}
#endif
#if HAVE_SSE41
// Check for SSE41 support:
if (max_level >= 1) {
__cpuid(1, eax, ebx, ecx, edx);
if (ecx & bit_SSE41) {
codec->enc = base64_stream_encode_sse41;
codec->dec = base64_stream_decode_sse41;
return true;
}
}
#endif
#if HAVE_SSSE3
// Check for SSSE3 support:
if (max_level >= 1) {
__cpuid(1, eax, ebx, ecx, edx);
if (ecx & bit_SSSE3) {
codec->enc = base64_stream_encode_ssse3;
codec->dec = base64_stream_decode_ssse3;
return true;
}
}
#endif
#else
// No x86 SIMD codec in this build: silence the unused-parameter warning.
(void)codec;
#endif
return false;
}
// Fill `codec` with the encoder/decoder pair to use.
//
// Selection order: a codec forced through `flags`, then the compile-time
// ARM choice, then x86 runtime detection. When none of these installs a
// codec, the portable plain implementation is used.
void
codec_choose (struct codec *codec, int flags)
{
	// `||` short-circuits, so each selector only runs when all earlier ones
	// declined:
	const bool chosen = codec_choose_forced(codec, flags)
	                 || codec_choose_arm(codec)
	                 || codec_choose_x86(codec);

	if (!chosen) {
		codec->enc = base64_stream_encode_plain;
		codec->dec = base64_stream_decode_plain;
	}
}

View File

@@ -0,0 +1,57 @@
#include "libbase64.h"
// Each macro below expands to a complete, parenthesized parameter list. It is
// placed directly after a function name or function-pointer declarator, so
// that every codec entry point, stub and function-pointer type shares one
// signature.

// Function parameters for encoding functions:
#define BASE64_ENC_PARAMS \
( struct base64_state *state \
, const char *src \
, size_t srclen \
, char *out \
, size_t *outlen \
)
// Function parameters for decoding functions:
#define BASE64_DEC_PARAMS \
( struct base64_state *state \
, const char *src \
, size_t srclen \
, char *out \
, size_t *outlen \
)
// Placeholder encoder for codecs that are not compiled into this build.
// It ignores its input, writes nothing to `out`, and reports zero output
// bytes through `outlen`.
static inline void
base64_enc_stub BASE64_ENC_PARAMS
{
	// The only observable effect: an empty result.
	*outlen = 0;

	// All other parameters are deliberately unused.
	(void) out;
	(void) srclen;
	(void) src;
	(void) state;
}
// Placeholder decoder for codecs that are not compiled into this build.
// It ignores every argument, leaves `*outlen` untouched, and always returns
// -1 to signal an invalid decoding result.
static inline int
base64_dec_stub BASE64_DEC_PARAMS
{
	// All parameters are deliberately unused.
	(void) outlen;
	(void) out;
	(void) srclen;
	(void) src;
	(void) state;

	return -1;
}
// Pointer types for codec entry points; both use the shared parameter lists.
typedef void (* base64_enc_fn) BASE64_ENC_PARAMS;
typedef int (* base64_dec_fn) BASE64_DEC_PARAMS;
// An encoder/decoder pair. codec_choose() fills in both members.
struct codec
{
base64_enc_fn enc;
base64_dec_fn dec;
};
// Selects the implementation to use and stores it in the given struct.
// `flags` may carry BASE64_FORCE_* bits to force a specific codec.
extern void codec_choose (struct codec *, int flags);

View File

@@ -0,0 +1,24 @@
// Build configuration: which SIMD codecs are compiled in. A HAVE_* macro that
// is left undefined evaluates to 0 in the `#if HAVE_*` tests that consume it.
#ifndef BASE64_CONFIG_H
#define BASE64_CONFIG_H
// 64-bit x86 (LP64 GCC/Clang targets or MSVC x64), excluding Apple platforms:
// enable every x86 SIMD codec except AVX512, which is explicitly disabled.
#if !defined(__APPLE__) && ((defined(__x86_64__) && defined(__LP64__)) || defined(_M_X64))
#define HAVE_SSSE3 1
#define HAVE_SSE41 1
#define HAVE_SSE42 1
#define HAVE_AVX 1
#define HAVE_AVX2 1
#define HAVE_AVX512 0
#endif
// The 32-bit NEON codec is always disabled in this configuration.
#define BASE64_WITH_NEON32 0
#define HAVE_NEON32 BASE64_WITH_NEON32
// The 64-bit NEON codec is enabled only on Apple aarch64 targets.
#if defined(__APPLE__) && defined(__aarch64__)
#define BASE64_WITH_NEON64 1
#else
#define BASE64_WITH_NEON64 0
#endif
#define HAVE_NEON64 BASE64_WITH_NEON64
#endif // BASE64_CONFIG_H

View File

@@ -0,0 +1,84 @@
#ifndef BASE64_ENV_H
#define BASE64_ENV_H
#include <stdint.h>
// This header file contains macro definitions that describe certain aspects of
// the compile-time environment. Compatibility and portability macros go here.
// Define machine endianness. This is for GCC:
// When neither __BYTE_ORDER__ nor __ORDER_LITTLE_ENDIAN__ is defined, both
// sides of the comparison evaluate to 0 in the preprocessor, so the test
// succeeds and little-endian becomes the default.
#if (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
# define BASE64_LITTLE_ENDIAN 1
#else
# define BASE64_LITTLE_ENDIAN 0
#endif
// This is for Clang:
// These redefine BASE64_LITTLE_ENDIAN; that is only valid when the new value
// matches the one chosen above.
#ifdef __LITTLE_ENDIAN__
# define BASE64_LITTLE_ENDIAN 1
#endif
#ifdef __BIG_ENDIAN__
# define BASE64_LITTLE_ENDIAN 0
#endif
// MSVC++ needs intrin.h for _byteswap_uint64 (issue #68):
#if BASE64_LITTLE_ENDIAN && defined(_MSC_VER)
# include <intrin.h>
#endif
// Endian conversion functions:
// BASE64_HTOBE32/64 convert a host-order value to big-endian: a byte swap on
// little-endian hosts, the identity otherwise.
#if BASE64_LITTLE_ENDIAN
# ifdef _MSC_VER
// Microsoft Visual C++:
# define BASE64_HTOBE32(x) _byteswap_ulong(x)
# define BASE64_HTOBE64(x) _byteswap_uint64(x)
# else
// GCC and Clang:
# define BASE64_HTOBE32(x) __builtin_bswap32(x)
# define BASE64_HTOBE64(x) __builtin_bswap64(x)
# endif
#else
// No conversion needed:
# define BASE64_HTOBE32(x) (x)
# define BASE64_HTOBE64(x) (x)
#endif
// Detect word size:
// BASE64_WORDSIZE is 32 or 64; any other size_t width is a compile error.
#if defined (__x86_64__)
// This also works for the x32 ABI, which has a 64-bit word size.
# define BASE64_WORDSIZE 64
#elif SIZE_MAX == UINT32_MAX
# define BASE64_WORDSIZE 32
#elif SIZE_MAX == UINT64_MAX
# define BASE64_WORDSIZE 64
#else
# error BASE64_WORDSIZE_NOT_DEFINED
#endif
// End-of-file definitions.
// NOTE(review): presumably these are the values stored in the `eof` field of
// struct base64_state by the codec implementations -- confirm against them.
// Almost end-of-file when waiting for the last '=' character:
#define BASE64_AEOF 1
// End-of-file when stream end has been reached or invalid input provided:
#define BASE64_EOF 2
// GCC 7 defaults to issuing a warning for fallthrough in switch statements,
// unless the fallthrough cases are marked with an attribute. As we use
// fallthrough deliberately, define an alias for the attribute:
// The expansion already ends in a semicolon, so the macro is used without one.
#if __GNUC__ >= 7
# define BASE64_FALLTHROUGH __attribute__((fallthrough));
#else
# define BASE64_FALLTHROUGH
#endif
// Declare macros to ensure that functions that are intended to be inlined, are
// actually inlined, even when no optimization is applied. A lot of inner loop
// code is factored into separate functions for reasons of readability, but
// that code should always be inlined (and optimized) in the main loop.
#ifdef _MSC_VER
# define BASE64_FORCE_INLINE __forceinline
#else
# define BASE64_FORCE_INLINE inline __attribute__((always_inline))
#endif
#endif // BASE64_ENV_H

View File

@@ -0,0 +1,164 @@
#include <stdint.h>
#include <stddef.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#include "libbase64.h"
#include "tables/tables.h"
#include "codecs.h"
#include "env.h"
// The encoder/decoder pair used by every stream function in this file. Both
// pointers start out NULL and are filled in by codec_choose() from the
// base64_stream_*_init() functions: on first use, and again whenever the
// caller passes codec-forcing flags. Without such flags the choice stays in
// place, the idea being that CPU features don't change at runtime.
// NOTE(review): this is a single file-scope object shared by all streams, with
// no synchronisation visible here -- confirm how concurrent init is handled.
static struct codec codec = { NULL, NULL };
// Prepare `state` for a new encoding stream and make sure an encoder has
// been selected.
//
// state: stream state to reset; `eof`, `bytes` and `carry` are zeroed and
//        `flags` is stored.
// flags: 0 for automatic codec selection, or BASE64_FORCE_* bits to force a
//        specific codec.
void
base64_stream_encode_init (struct base64_state *state, int flags)
{
	// Select a codec on first use, and redo the choice whenever any codec
	// flag is set. The mask has to span every BASE64_FORCE_* bit: the
	// previous 0xFF stopped at bit 7, so BASE64_FORCE_AVX512 (1 << 8) was
	// ignored once a codec had been chosen. 0xFFFF matches the mask used by
	// base64_stream_decode_init() and codec_choose_forced().
	if (codec.enc == NULL || (flags & 0xFFFF)) {
		codec_choose(&codec, flags);
	}
	state->eof = 0;
	state->bytes = 0;
	state->carry = 0;
	state->flags = flags;
}
// Encode `srclen` bytes from `src` into `out` by forwarding all arguments to
// the currently selected encoder, which reports the number of bytes written
// through `outlen`. base64_stream_encode_init() must have been called first:
// the encoder pointer is NULL until a codec has been chosen.
void
base64_stream_encode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
)
{
codec.enc(state, src, srclen, out, outlen);
}
// Write the end-of-stream trailer for an encoding stream.
//
// When `state->bytes` is 1 or 2, one more alphabet character (looked up from
// `state->carry`) is written, followed by two or one '=' padding characters
// respectively. For any other value nothing is written. The number of bytes
// written to `out` (3, 2 or 0) is stored in `*outlen`.
void
base64_stream_encode_final
	( struct base64_state	*state
	, char			*out
	, size_t		*outlen
	)
{
	uint8_t *dst = (uint8_t *)out;
	size_t padding;

	switch (state->bytes) {
	case 1:
		padding = 2;
		break;
	case 2:
		padding = 1;
		break;
	default:
		// No pending input: no trailer.
		*outlen = 0;
		return;
	}

	// Flush the carried bits as one final character, then pad:
	dst[0] = base64_table_enc_6bit[state->carry];
	dst[1] = '=';
	if (padding == 2) {
		dst[2] = '=';
	}
	*outlen = 1 + padding;
}
// Prepare `state` for a new decoding stream and make sure a decoder has
// been selected.
//
// state: stream state to reset; `eof`, `bytes` and `carry` are zeroed and
//        `flags` is stored.
// flags: 0 for automatic codec selection, or BASE64_FORCE_* bits to force a
//        specific codec.
void
base64_stream_decode_init (struct base64_state *state, int flags)
{
	// Any bit in the low 16 bits counts as a request to redo the choice:
	const int codec_requested = (flags & 0xFFFF) != 0;

	if (codec.dec == NULL || codec_requested) {
		codec_choose(&codec, flags);
	}

	state->flags = flags;
	state->carry = 0;
	state->bytes = 0;
	state->eof = 0;
}
// Decode `srclen` bytes from `src` into `out` by forwarding all arguments to
// the currently selected decoder, and return that decoder's result unchanged.
// base64_stream_decode_init() must have been called first: the decoder
// pointer is NULL until a codec has been chosen.
int
base64_stream_decode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
)
{
return codec.dec(state, src, srclen, out, outlen);
}
#ifdef _OPENMP
// Due to the overhead of initializing OpenMP and creating a team of
// threads, we require the data length to be larger than a threshold:
#define OMP_THRESHOLD 20000
// Conditionally include OpenMP-accelerated codec implementations:
#include "lib_openmp.c"
#endif
// One-shot encoder: encode `srclen` bytes from `src` into `out` and store
// the total number of bytes written in `*outlen`. `flags` is passed on to
// base64_stream_encode_init(). In OpenMP builds, inputs of at least
// OMP_THRESHOLD bytes are handed to the OpenMP implementation instead.
void
base64_encode
	( const char	*src
	, size_t	 srclen
	, char		*out
	, size_t	*outlen
	, int		 flags
	)
{
	struct base64_state state;
	size_t body_len;
	size_t tail_len;

#ifdef _OPENMP
	if (srclen >= OMP_THRESHOLD) {
		base64_encode_openmp(src, srclen, out, outlen, flags);
		return;
	}
#endif

	// Reset the stream state and pick a codec:
	base64_stream_encode_init(&state, flags);

	// Encode the whole input in a single pass:
	base64_stream_encode(&state, src, srclen, out, &body_len);

	// Append the trailer, if any, right behind the encoded body:
	base64_stream_encode_final(&state, out + body_len, &tail_len);

	// Total length is the body plus the trailer:
	*outlen = body_len + tail_len;
}
// One-shot decoder: decode `srclen` bytes from `src` into `out`; the decoder
// reports the output length through `outlen`. `flags` is passed on to
// base64_stream_decode_init(). In OpenMP builds, inputs of at least
// OMP_THRESHOLD bytes are handed to the OpenMP implementation instead.
//
// Returns the stream decoder's result when it is nonzero and no partial
// block is pending in the state; returns 0 otherwise.
int
base64_decode
	( const char	*src
	, size_t	 srclen
	, char		*out
	, size_t	*outlen
	, int		 flags
	)
{
	struct base64_state state;
	int status;

#ifdef _OPENMP
	if (srclen >= OMP_THRESHOLD) {
		return base64_decode_openmp(src, srclen, out, outlen, flags);
	}
#endif

	// Reset the stream state and pick a codec:
	base64_stream_decode_init(&state, flags);

	// Decode the whole input in a single pass:
	status = base64_stream_decode(&state, src, srclen, out, outlen);

	// A decoder still waiting for more input after the whole block has been
	// fed to it counts as a failure:
	return (status && state.bytes == 0) ? status : 0;
}

View File

@@ -0,0 +1,146 @@
#ifndef LIBBASE64_H
#define LIBBASE64_H
#include <stddef.h> /* size_t */
/* Symbol visibility helpers. BASE64_SYMBOL_IMPORT/EXPORT/PRIVATE map to
 * __declspec on Windows and Cygwin, to visibility attributes on GCC 4 and
 * later, and to nothing elsewhere. */
#if defined(_WIN32) || defined(__CYGWIN__)
#define BASE64_SYMBOL_IMPORT __declspec(dllimport)
#define BASE64_SYMBOL_EXPORT __declspec(dllexport)
#define BASE64_SYMBOL_PRIVATE
#elif __GNUC__ >= 4
#define BASE64_SYMBOL_IMPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_EXPORT __attribute__ ((visibility ("default")))
#define BASE64_SYMBOL_PRIVATE __attribute__ ((visibility ("hidden")))
#else
#define BASE64_SYMBOL_IMPORT
#define BASE64_SYMBOL_EXPORT
#define BASE64_SYMBOL_PRIVATE
#endif
/* BASE64_EXPORT decorates the public API declarations below:
 *   - BASE64_STATIC_DEFINE defined: no decoration at all;
 *   - BASE64_EXPORTS defined: export the symbols;
 *   - otherwise: import the symbols.
 * BASE64_NO_EXPORT marks symbols as private, except when
 * BASE64_STATIC_DEFINE is defined, where it is empty. */
#if defined(BASE64_STATIC_DEFINE)
#define BASE64_EXPORT
#define BASE64_NO_EXPORT
#else
#if defined(BASE64_EXPORTS) // defined if we are building the shared library
#define BASE64_EXPORT BASE64_SYMBOL_EXPORT
#else
#define BASE64_EXPORT BASE64_SYMBOL_IMPORT
#endif
#define BASE64_NO_EXPORT BASE64_SYMBOL_PRIVATE
#endif
#ifdef __cplusplus
extern "C" {
#endif
/* These are the flags that can be passed in the `flags` argument. The values
 * below force the use of a given codec, even if that codec is a no-op in the
 * current build. Used in testing. Set to 0 for the default behavior, which is
 * runtime feature detection on x86, a compile-time fixed codec on ARM, and
 * the plain codec on other platforms.
 *
 * Each flag is a distinct bit. The codec selector looks at the low 16 bits of
 * `flags` and tests the flags in the order listed here, so when several are
 * set at once the first one listed wins: */
#define BASE64_FORCE_AVX2 (1 << 0)
#define BASE64_FORCE_NEON32 (1 << 1)
#define BASE64_FORCE_NEON64 (1 << 2)
#define BASE64_FORCE_PLAIN (1 << 3)
#define BASE64_FORCE_SSSE3 (1 << 4)
#define BASE64_FORCE_SSE41 (1 << 5)
#define BASE64_FORCE_SSE42 (1 << 6)
#define BASE64_FORCE_AVX (1 << 7)
#define BASE64_FORCE_AVX512 (1 << 8)
/* State carried between calls of the streaming API. It is set up by
 * base64_stream_encode_init() or base64_stream_decode_init(), which zero
 * `eof`, `bytes` and `carry` and store the caller's flags in `flags`. */
struct base64_state {
/* NOTE(review): presumably an end-of-stream marker maintained by the
 * decoder implementations -- confirm; only its reset to 0 is certain. */
int eof;
/* Count of pending input bytes: the encoder's finalizer emits a padded
 * trailer when this is 1 or 2, and the one-shot decoder requires it to
 * be 0 after decoding for a result to count as success. */
int bytes;
/* The `flags` value that was passed to the init function. */
int flags;
/* Leftover bits from the previous call; the encoder's finalizer uses
 * this value as an index into its 6-bit alphabet table. */
unsigned char carry;
};
/* Wrapper function to encode a plain string of given length. Output is written
* to *out without trailing zero. Output length in bytes is written to *outlen.
* The buffer in `out` has been allocated by the caller and is at least 4/3 the
* size of the input. See above for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_encode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
) ;
/* Call this before calling base64_stream_encode() to init the state. See above
* for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_stream_encode_init
( struct base64_state *state
, int flags
) ;
/* Encodes the block of data of given length at `src`, into the buffer at
* `out`. Caller is responsible for allocating a large enough out-buffer; it
* must be at least 4/3 the size of the in-buffer, but take some margin. Places
* the number of new bytes written into `outlen` (which is set to zero when the
* function starts). Does not zero-terminate or finalize the output. */
void BASE64_EXPORT base64_stream_encode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
) ;
/* Finalizes the output begun by previous calls to `base64_stream_encode()`.
* Adds the required end-of-stream markers if appropriate. `outlen` is modified
* and will contain the number of new bytes written at `out` (which will quite
* often be zero). */
void BASE64_EXPORT base64_stream_encode_final
( struct base64_state *state
, char *out
, size_t *outlen
) ;
/* Wrapper function to decode a plain string of given length. Output is written
 * to *out without trailing zero. Output length in bytes is written to *outlen.
 * The buffer in `out` has been allocated by the caller and is at least 3/4 the
 * size of the input. See above for `flags`, set to 0 for default operation.
 *
 * Return value (non-OpenMP code path): the result of the underlying stream
 * decoder when that result is nonzero and the input ended on a complete
 * block, i.e. 1 on success, or -1 if the chosen codec is not included in the
 * current build. Returns 0 on a decoding error or when the input ended in
 * the middle of a block. */
int BASE64_EXPORT base64_decode
( const char *src
, size_t srclen
, char *out
, size_t *outlen
, int flags
) ;
/* Call this before calling base64_stream_decode() to init the state. See above
* for `flags`; set to 0 for default operation: */
void BASE64_EXPORT base64_stream_decode_init
( struct base64_state *state
, int flags
) ;
/* Decodes the block of data of given length at `src`, into the buffer at
* `out`. Caller is responsible for allocating a large enough out-buffer; it
* must be at least 3/4 the size of the in-buffer, but take some margin. Places
* the number of new bytes written into `outlen` (which is set to zero when the
* function starts). Does not zero-terminate the output. Returns 1 if all is
* well, and 0 if a decoding error was found, such as an invalid character.
* Returns -1 if the chosen codec is not included in the current build. Used by
* the test harness to check whether a codec is available for testing. */
int BASE64_EXPORT base64_stream_decode
( struct base64_state *state
, const char *src
, size_t srclen
, char *out
, size_t *outlen
) ;
#ifdef __cplusplus
}
#endif
#endif /* LIBBASE64_H */

View File

@@ -0,0 +1,393 @@
#include <stdint.h>
#define CHAR62 '+'
#define CHAR63 '/'
#define CHARPAD '='
#if BASE64_LITTLE_ENDIAN
/* SPECIAL DECODE TABLES FOR LITTLE ENDIAN (INTEL) CPUS */
const uint32_t base64_table_dec_32bit_d0[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x000000f8, 0xffffffff, 0xffffffff, 0xffffffff, 0x000000fc,
0x000000d0, 0x000000d4, 0x000000d8, 0x000000dc, 0x000000e0, 0x000000e4,
0x000000e8, 0x000000ec, 0x000000f0, 0x000000f4, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00000004, 0x00000008, 0x0000000c, 0x00000010, 0x00000014, 0x00000018,
0x0000001c, 0x00000020, 0x00000024, 0x00000028, 0x0000002c, 0x00000030,
0x00000034, 0x00000038, 0x0000003c, 0x00000040, 0x00000044, 0x00000048,
0x0000004c, 0x00000050, 0x00000054, 0x00000058, 0x0000005c, 0x00000060,
0x00000064, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00000068, 0x0000006c, 0x00000070, 0x00000074, 0x00000078,
0x0000007c, 0x00000080, 0x00000084, 0x00000088, 0x0000008c, 0x00000090,
0x00000094, 0x00000098, 0x0000009c, 0x000000a0, 0x000000a4, 0x000000a8,
0x000000ac, 0x000000b0, 0x000000b4, 0x000000b8, 0x000000bc, 0x000000c0,
0x000000c4, 0x000000c8, 0x000000cc, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d1[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x0000e003, 0xffffffff, 0xffffffff, 0xffffffff, 0x0000f003,
0x00004003, 0x00005003, 0x00006003, 0x00007003, 0x00008003, 0x00009003,
0x0000a003, 0x0000b003, 0x0000c003, 0x0000d003, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00001000, 0x00002000, 0x00003000, 0x00004000, 0x00005000, 0x00006000,
0x00007000, 0x00008000, 0x00009000, 0x0000a000, 0x0000b000, 0x0000c000,
0x0000d000, 0x0000e000, 0x0000f000, 0x00000001, 0x00001001, 0x00002001,
0x00003001, 0x00004001, 0x00005001, 0x00006001, 0x00007001, 0x00008001,
0x00009001, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x0000a001, 0x0000b001, 0x0000c001, 0x0000d001, 0x0000e001,
0x0000f001, 0x00000002, 0x00001002, 0x00002002, 0x00003002, 0x00004002,
0x00005002, 0x00006002, 0x00007002, 0x00008002, 0x00009002, 0x0000a002,
0x0000b002, 0x0000c002, 0x0000d002, 0x0000e002, 0x0000f002, 0x00000003,
0x00001003, 0x00002003, 0x00003003, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d2[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00800f00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00c00f00,
0x00000d00, 0x00400d00, 0x00800d00, 0x00c00d00, 0x00000e00, 0x00400e00,
0x00800e00, 0x00c00e00, 0x00000f00, 0x00400f00, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00400000, 0x00800000, 0x00c00000, 0x00000100, 0x00400100, 0x00800100,
0x00c00100, 0x00000200, 0x00400200, 0x00800200, 0x00c00200, 0x00000300,
0x00400300, 0x00800300, 0x00c00300, 0x00000400, 0x00400400, 0x00800400,
0x00c00400, 0x00000500, 0x00400500, 0x00800500, 0x00c00500, 0x00000600,
0x00400600, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00800600, 0x00c00600, 0x00000700, 0x00400700, 0x00800700,
0x00c00700, 0x00000800, 0x00400800, 0x00800800, 0x00c00800, 0x00000900,
0x00400900, 0x00800900, 0x00c00900, 0x00000a00, 0x00400a00, 0x00800a00,
0x00c00a00, 0x00000b00, 0x00400b00, 0x00800b00, 0x00c00b00, 0x00000c00,
0x00400c00, 0x00800c00, 0x00c00c00, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d3[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x003e0000, 0xffffffff, 0xffffffff, 0xffffffff, 0x003f0000,
0x00340000, 0x00350000, 0x00360000, 0x00370000, 0x00380000, 0x00390000,
0x003a0000, 0x003b0000, 0x003c0000, 0x003d0000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00010000, 0x00020000, 0x00030000, 0x00040000, 0x00050000, 0x00060000,
0x00070000, 0x00080000, 0x00090000, 0x000a0000, 0x000b0000, 0x000c0000,
0x000d0000, 0x000e0000, 0x000f0000, 0x00100000, 0x00110000, 0x00120000,
0x00130000, 0x00140000, 0x00150000, 0x00160000, 0x00170000, 0x00180000,
0x00190000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x001a0000, 0x001b0000, 0x001c0000, 0x001d0000, 0x001e0000,
0x001f0000, 0x00200000, 0x00210000, 0x00220000, 0x00230000, 0x00240000,
0x00250000, 0x00260000, 0x00270000, 0x00280000, 0x00290000, 0x002a0000,
0x002b0000, 0x002c0000, 0x002d0000, 0x002e0000, 0x002f0000, 0x00300000,
0x00310000, 0x00320000, 0x00330000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
#else
/* SPECIAL DECODE TABLES FOR BIG ENDIAN (IBM/MOTOROLA/SUN) CPUS */
const uint32_t base64_table_dec_32bit_d0[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xf8000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xfc000000,
0xd0000000, 0xd4000000, 0xd8000000, 0xdc000000, 0xe0000000, 0xe4000000,
0xe8000000, 0xec000000, 0xf0000000, 0xf4000000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x04000000, 0x08000000, 0x0c000000, 0x10000000, 0x14000000, 0x18000000,
0x1c000000, 0x20000000, 0x24000000, 0x28000000, 0x2c000000, 0x30000000,
0x34000000, 0x38000000, 0x3c000000, 0x40000000, 0x44000000, 0x48000000,
0x4c000000, 0x50000000, 0x54000000, 0x58000000, 0x5c000000, 0x60000000,
0x64000000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x68000000, 0x6c000000, 0x70000000, 0x74000000, 0x78000000,
0x7c000000, 0x80000000, 0x84000000, 0x88000000, 0x8c000000, 0x90000000,
0x94000000, 0x98000000, 0x9c000000, 0xa0000000, 0xa4000000, 0xa8000000,
0xac000000, 0xb0000000, 0xb4000000, 0xb8000000, 0xbc000000, 0xc0000000,
0xc4000000, 0xc8000000, 0xcc000000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d1[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x03e00000, 0xffffffff, 0xffffffff, 0xffffffff, 0x03f00000,
0x03400000, 0x03500000, 0x03600000, 0x03700000, 0x03800000, 0x03900000,
0x03a00000, 0x03b00000, 0x03c00000, 0x03d00000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00100000, 0x00200000, 0x00300000, 0x00400000, 0x00500000, 0x00600000,
0x00700000, 0x00800000, 0x00900000, 0x00a00000, 0x00b00000, 0x00c00000,
0x00d00000, 0x00e00000, 0x00f00000, 0x01000000, 0x01100000, 0x01200000,
0x01300000, 0x01400000, 0x01500000, 0x01600000, 0x01700000, 0x01800000,
0x01900000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x01a00000, 0x01b00000, 0x01c00000, 0x01d00000, 0x01e00000,
0x01f00000, 0x02000000, 0x02100000, 0x02200000, 0x02300000, 0x02400000,
0x02500000, 0x02600000, 0x02700000, 0x02800000, 0x02900000, 0x02a00000,
0x02b00000, 0x02c00000, 0x02d00000, 0x02e00000, 0x02f00000, 0x03000000,
0x03100000, 0x03200000, 0x03300000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d2[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x000f8000, 0xffffffff, 0xffffffff, 0xffffffff, 0x000fc000,
0x000d0000, 0x000d4000, 0x000d8000, 0x000dc000, 0x000e0000, 0x000e4000,
0x000e8000, 0x000ec000, 0x000f0000, 0x000f4000, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00004000, 0x00008000, 0x0000c000, 0x00010000, 0x00014000, 0x00018000,
0x0001c000, 0x00020000, 0x00024000, 0x00028000, 0x0002c000, 0x00030000,
0x00034000, 0x00038000, 0x0003c000, 0x00040000, 0x00044000, 0x00048000,
0x0004c000, 0x00050000, 0x00054000, 0x00058000, 0x0005c000, 0x00060000,
0x00064000, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00068000, 0x0006c000, 0x00070000, 0x00074000, 0x00078000,
0x0007c000, 0x00080000, 0x00084000, 0x00088000, 0x0008c000, 0x00090000,
0x00094000, 0x00098000, 0x0009c000, 0x000a0000, 0x000a4000, 0x000a8000,
0x000ac000, 0x000b0000, 0x000b4000, 0x000b8000, 0x000bc000, 0x000c0000,
0x000c4000, 0x000c8000, 0x000cc000, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
const uint32_t base64_table_dec_32bit_d3[256] = {
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00003e00, 0xffffffff, 0xffffffff, 0xffffffff, 0x00003f00,
0x00003400, 0x00003500, 0x00003600, 0x00003700, 0x00003800, 0x00003900,
0x00003a00, 0x00003b00, 0x00003c00, 0x00003d00, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
0x00000100, 0x00000200, 0x00000300, 0x00000400, 0x00000500, 0x00000600,
0x00000700, 0x00000800, 0x00000900, 0x00000a00, 0x00000b00, 0x00000c00,
0x00000d00, 0x00000e00, 0x00000f00, 0x00001000, 0x00001100, 0x00001200,
0x00001300, 0x00001400, 0x00001500, 0x00001600, 0x00001700, 0x00001800,
0x00001900, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0x00001a00, 0x00001b00, 0x00001c00, 0x00001d00, 0x00001e00,
0x00001f00, 0x00002000, 0x00002100, 0x00002200, 0x00002300, 0x00002400,
0x00002500, 0x00002600, 0x00002700, 0x00002800, 0x00002900, 0x00002a00,
0x00002b00, 0x00002c00, 0x00002d00, 0x00002e00, 0x00002f00, 0x00003000,
0x00003100, 0x00003200, 0x00003300, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff,
0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff
};
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,40 @@
#include "tables.h"
// Encoding alphabet: entry i is the base64 character for the 6-bit value i
// (A-Z, a-z, 0-9, '+', '/'). Because the initializer is a string literal,
// the array also carries a trailing NUL byte after the 64 alphabet characters.
const uint8_t
base64_table_enc_6bit[] =
"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
"abcdefghijklmnopqrstuvwxyz"
"0123456789"
"+/";
// In the lookup table below, note that the value for '=' (character 61) is
// 254, not 255. This character is used for in-band signaling of the end of
// the datastream, and we will use that later. The characters A-Z, a-z, 0-9
// and + / are mapped to their "decoded" values. The other bytes all map to
// the value 255, which flags them as "invalid input".
//
// The table is indexed by the input byte value (0..255); each row below
// holds 16 consecutive entries, with the covered index range in the
// trailing comment.
const uint8_t
base64_table_dec_8bit[] =
{
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 0..15
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 16..31
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 62, 255, 255, 255, 63, // 32..47
52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 255, 255, 255, 254, 255, 255, // 48..63
255, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, // 64..79
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 255, 255, 255, 255, 255, // 80..95
255, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, // 96..111
41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 255, 255, 255, 255, 255, // 112..127
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 128..143
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 144..159
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 160..175
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 176..191
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 192..207
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 208..223
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 224..239
255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, // 240..255
};
#if BASE64_WORDSIZE >= 32
# include "table_dec_32bit.h"
# include "table_enc_12bit.h"
#endif

View File

@@ -0,0 +1,23 @@
// Declarations of the shared base64 lookup tables. The definitions live in
// the accompanying tables source file and the headers it includes.
#ifndef BASE64_TABLES_H
#define BASE64_TABLES_H
#include <stdint.h>
// NOTE(review): BASE64_WORDSIZE is presumably defined by env.h -- confirm.
#include "../env.h"
// These tables are used by all codecs for fallback plain encoding/decoding:
// 6-bit value -> base64 character.
extern const uint8_t base64_table_enc_6bit[];
// Input byte -> 6-bit value; 254 marks '=' and 255 marks invalid input.
extern const uint8_t base64_table_dec_8bit[];
// These tables are used for the 32-bit and 64-bit generic decoders:
#if BASE64_WORDSIZE >= 32
// One table per input character position (d0..d3); 0xffffffff entries mark
// bytes that are not part of the base64 alphabet.
extern const uint32_t base64_table_dec_32bit_d0[];
extern const uint32_t base64_table_dec_32bit_d1[];
extern const uint32_t base64_table_dec_32bit_d2[];
extern const uint32_t base64_table_dec_32bit_d3[];
// This table is used by the 32 and 64-bit generic encoders:
extern const uint16_t base64_table_enc_12bit[];
#endif
#endif // BASE64_TABLES_H

View File

@@ -0,0 +1,164 @@
// Bytes primitive operations
//
// These are registered in mypyc.primitives.bytes_ops.
#include <Python.h>
#include "CPy.h"
// Equality test for two objects, specialised for exact bytes instances.
//
// Result: 1 when equal, 0 when different, -1 when the comparison raised.
// Anything that is not an exact bytes/bytes pair is delegated to
// PyObject_RichCompareBool with Py_EQ.
int CPyBytes_Compare(PyObject *left, PyObject *right) {
    if (!PyBytes_CheckExact(left) || !PyBytes_CheckExact(right)) {
        return PyObject_RichCompareBool(left, right, Py_EQ);
    }

    // Identity implies equality.
    if (left == right) {
        return 1;
    }

    // Same approach as CPython's internal bytes_compare: compare the sizes,
    // then the first byte, and only then the whole buffer.
    const Py_ssize_t nbytes = Py_SIZE(left);
    if (nbytes != Py_SIZE(right)) {
        return 0;
    }

    const char *lhs = ((PyBytesObject *)left)->ob_sval;
    const char *rhs = ((PyBytesObject *)right)->ob_sval;
    if (lhs[0] != rhs[0]) {
        return 0;
    }
    return memcmp(lhs, rhs, nbytes) == 0;
}
// Index into a bytes or bytearray object.
//
// o:     the object to index; any object that is not a bytes instance is
//        treated as a bytearray.
// index: tagged integer index; negative values count from the end.
//
// Returns the selected byte as a short tagged int. On failure an exception
// is set (IndexError for an out-of-range index, OverflowError when the index
// is not a short tagged int) and CPY_INT_TAG is returned.
CPyTagged CPyBytes_GetItem(PyObject *o, CPyTagged index) {
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }

    Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t size = ((PyVarObject *)o)->ob_size;
    if (n < 0)
        n += size;
    if (n < 0 || n >= size) {
        PyErr_SetString(PyExc_IndexError, "index out of range");
        return CPY_INT_TAG;
    }

    // Use the accessor macros rather than raw struct fields. For a bytearray
    // the logical data starts at ob_start, which can differ from the physical
    // ob_bytes buffer after bytes were removed from the front; indexing
    // ob_bytes directly would then read the wrong byte.
    const char *data = PyBytes_Check(o) ? PyBytes_AS_STRING(o)
                                        : PyByteArray_AS_STRING(o);
    unsigned char num = (unsigned char)data[n];
    // Shift left by one to encode the value as a short tagged int.
    return num << 1;
}
// Concatenate a and b.
//
// bytes + bytes is built directly into a freshly allocated bytes object.
// When a is a bytearray the work is delegated to PyByteArray_Concat, and
// every remaining combination goes through PyBytes_Concat.
// Returns the result, or NULL when the operation failed.
PyObject *CPyBytes_Concat(PyObject *a, PyObject *b) {
    if (!PyBytes_Check(a) || !PyBytes_Check(b)) {
        if (PyByteArray_Check(a)) {
            return PyByteArray_Concat(a, b);
        }
        // NOTE(review): PyBytes_Concat replaces *a and releases the old
        // value, so this path appears to consume the caller's reference to
        // 'a' -- confirm against how the primitive is registered.
        PyBytes_Concat(&a, b);
        return a;
    }

    const Py_ssize_t left_len = ((PyVarObject *)a)->ob_size;
    const Py_ssize_t right_len = ((PyVarObject *)b)->ob_size;
    PyBytesObject *joined =
        (PyBytesObject *)PyBytes_FromStringAndSize(NULL, left_len + right_len);
    if (joined == NULL) {
        return NULL;
    }
    char *dst = joined->ob_sval;
    memcpy(dst, ((PyBytesObject *)a)->ob_sval, left_len);
    memcpy(dst + left_len, ((PyBytesObject *)b)->ob_sval, right_len);
    return (PyObject *)joined;
}
// Limit a to the range [b, c]: values below b yield b, values at or above c
// yield c, everything else is returned unchanged.
static inline Py_ssize_t Clamp(Py_ssize_t a, Py_ssize_t b, Py_ssize_t c) {
    if (a < b) {
        return b;
    }
    if (a >= c) {
        return c;
    }
    return a;
}
// Return obj[start:end].
//
// Fast path: obj is a bytes or bytearray and both bounds are short tagged
// ints. Negative bounds are taken relative to the length and both bounds are
// clamped to [0, len]. The result has the same kind (bytes or bytearray) as
// obj. Every other case is forwarded to CPyObject_GetSlice.
//
// Returns a new reference, or NULL with an exception set.
PyObject *CPyBytes_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if ((PyBytes_Check(obj) || PyByteArray_Check(obj))
            && CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end)) {
        Py_ssize_t startn = CPyTagged_ShortAsSsize_t(start);
        Py_ssize_t endn = CPyTagged_ShortAsSsize_t(end);
        Py_ssize_t len = ((PyVarObject *)obj)->ob_size;
        if (startn < 0) {
            startn += len;
        }
        if (endn < 0) {
            endn += len;
        }
        startn = Clamp(startn, 0, len);
        endn = Clamp(endn, 0, len);
        Py_ssize_t slice_len = endn - startn;
        // An inverted range (start past end) selects nothing. Without this
        // guard a negative length would reach *_FromStringAndSize, which
        // rejects it with SystemError instead of producing an empty object.
        if (slice_len < 0) {
            slice_len = 0;
        }
        if (PyBytes_Check(obj)) {
            return PyBytes_FromStringAndSize(PyBytes_AS_STRING(obj) + startn, slice_len);
        } else {
            return PyByteArray_FromStringAndSize(PyByteArray_AS_STRING(obj) + startn, slice_len);
        }
    }
    return CPyObject_GetSlice(obj, start, end);
}
// sep.join(iter).
//
// An exact bytes separator is handled by PyBytes_Join. Any other separator
// (most commonly a bytearray) is handled by calling its "join" method
// dynamically. Returns the joined object, or NULL on error.
PyObject *CPyBytes_Join(PyObject *sep, PyObject *iter) {
    if (!PyBytes_CheckExact(sep)) {
        _Py_IDENTIFIER(join);
        PyObject *method = _PyUnicode_FromId(&PyId_join); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodOneArg(sep, method, iter);
    }
    return PyBytes_Join(sep, iter);
}
// Concatenate 'len' objects passed as variadic PyObject * arguments into a
// single new bytes object.
//
// Each argument is read through the PyBytesObject layout (ob_size/ob_sval),
// so callers are expected to pass bytes objects only.
//
// Returns a new reference, or NULL with an exception set (OverflowError when
// the combined length does not fit in Py_ssize_t, or whatever
// PyBytes_FromStringAndSize raised).
PyObject *CPyBytes_Build(Py_ssize_t len, ...) {
    Py_ssize_t i;
    Py_ssize_t sz = 0;

    // First pass: add up the item sizes to learn the result length.
    va_list args;
    va_start(args, len);
    for (i = 0; i < len; i++) {
        PyObject *item = va_arg(args, PyObject *);
        size_t add_sz = ((PyVarObject *)item)->ob_size;
        // Using size_t to avoid overflow during arithmetic calculation
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            // Every va_start must be paired with va_end before returning.
            va_end(args);
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python bytes");
            return NULL;
        }
        sz += add_sz;
    }
    va_end(args);

    PyBytesObject *ret = (PyBytesObject *)PyBytes_FromStringAndSize(NULL, sz);
    if (ret != NULL) {
        // Second pass: copy each item's payload into place.
        char *res_data = ret->ob_sval;
        va_start(args, len);
        for (i = 0; i < len; i++) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t item_sz = ((PyVarObject *)item)->ob_size;
            memcpy(res_data, ((PyBytesObject *)item)->ob_sval, item_sz);
            res_data += item_sz;
        }
        va_end(args);
        assert(res_data == ret->ob_sval + ((PyVarObject *)ret)->ob_size);
    }
    return (PyObject *)ret;
}
// ord() for a bytes or bytearray object of length one.
//
// Returns the single byte as a short tagged int. For any other argument a
// TypeError is set and CPY_INT_TAG is returned.
CPyTagged CPyBytes_Ord(PyObject *obj) {
    const char *single = NULL;

    if (PyBytes_Check(obj)) {
        if (PyBytes_GET_SIZE(obj) == 1) {
            single = PyBytes_AS_STRING(obj);
        }
    } else if (PyByteArray_Check(obj)) {
        if (PyByteArray_GET_SIZE(obj) == 1) {
            single = PyByteArray_AS_STRING(obj);
        }
    }

    if (single != NULL) {
        // Shift left by one to encode the value as a short tagged int.
        return (unsigned char)(single[0]) << 1;
    }

    PyErr_SetString(PyExc_TypeError, "ord() expects a character");
    return CPY_INT_TAG;
}

View File

@@ -0,0 +1,491 @@
// Dict primitive operations
//
// These are registered in mypyc.primitives.dict_ops.
#include <Python.h>
#include "CPy.h"
#ifndef Py_TPFLAGS_MAPPING
#define Py_TPFLAGS_MAPPING (1 << 6)
#endif
// dict[key].
//
// Dict subclasses such as defaultdict customise lookups, so the direct dict
// API is used only for exact dict instances; everything else goes through
// the generic PyObject_GetItem protocol.
//
// Returns a new reference, or NULL with an exception set (KeyError when an
// exact dict does not contain the key).
PyObject *CPyDict_GetItem(PyObject *dict, PyObject *key) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_GetItem(dict, key);
    }

    PyObject *found = PyDict_GetItemWithError(dict, key);
    if (found != NULL) {
        // PyDict_GetItemWithError hands back a borrowed reference.
        Py_INCREF(found);
        return found;
    }

    // NULL without a pending error means the key is simply missing.
    if (!PyErr_Occurred()) {
        PyErr_SetObject(PyExc_KeyError, key);
    }
    return NULL;
}
// Build a new dict from 'size' key/value pairs passed as variadic
// PyObject * arguments in the order key0, value0, key1, value1, ...
//
// Returns a new reference, or NULL with an exception set if allocation or
// any insertion fails.
PyObject *CPyDict_Build(Py_ssize_t size, ...) {
    Py_ssize_t i;

    PyObject *res = _PyDict_NewPresized(size);
    if (res == NULL) {
        return NULL;
    }

    va_list args;
    va_start(args, size);

    for (i = 0; i < size; i++) {
        PyObject *key = va_arg(args, PyObject *);
        PyObject *value = va_arg(args, PyObject *);
        if (PyDict_SetItem(res, key, value)) {
            // Every va_start must be paired with va_end before returning.
            va_end(args);
            Py_DECREF(res);
            return NULL;
        }
    }

    va_end(args);
    return res;
}
// dict.get(key, fallback).
//
// Uses the direct dict lookup even for subclasses, on the (admittedly
// dodgy) assumption that a subclass does not give get() different behavior.
//
// Returns a new reference to the stored value, or to 'fallback' when the key
// is absent; NULL when the lookup itself raised.
PyObject *CPyDict_Get(PyObject *dict, PyObject *key, PyObject *fallback) {
    PyObject *hit = PyDict_GetItemWithError(dict, key);
    if (hit == NULL && PyErr_Occurred()) {
        return NULL;
    }

    PyObject *result = (hit != NULL) ? hit : fallback;
    Py_INCREF(result);
    return result;
}
// dict.get(key): CPyDict_Get with None as the fallback value.
PyObject *CPyDict_GetWithNone(PyObject *dict, PyObject *key) {
    PyObject *const fallback = Py_None;
    return CPyDict_Get(dict, key, fallback);
}
// dict.setdefault(key, value).
//
// Exact dicts use PyDict_SetDefault directly; other objects get a dynamic
// call to their "setdefault" method.
// Returns a new reference, or NULL on error.
PyObject *CPyDict_SetDefault(PyObject *dict, PyObject *key, PyObject *value) {
    if (!PyDict_CheckExact(dict)) {
        _Py_IDENTIFIER(setdefault);
        PyObject *method = _PyUnicode_FromId(&PyId_setdefault); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodObjArgs(dict, method, key, value, NULL);
    }

    // PyDict_SetDefault returns a borrowed reference (or NULL on error).
    PyObject *stored = PyDict_SetDefault(dict, key, value);
    Py_XINCREF(stored);
    return stored;
}
// dict.setdefault(key): CPyDict_SetDefault with None as the default value.
PyObject *CPyDict_SetDefaultWithNone(PyObject *dict, PyObject *key) {
    PyObject *const default_value = Py_None;
    return CPyDict_SetDefault(dict, key, default_value);
}
// dict.setdefault(key, <empty container>), where the container kind is
// selected by data_type: 1 = list, 2 = dict, 3 = set.
//
// If the key is already present its value is returned. Otherwise a new empty
// container is created, stored under key, and returned.
//
// Returns a new reference, or NULL on failure. Note that an unrecognised
// data_type returns NULL without setting an exception.
PyObject *CPyDict_SetDefaultWithEmptyDatatype(PyObject *dict, PyObject *key,
                                              int data_type) {
    PyObject *res = CPyDict_GetItem(dict, key);
    if (res != NULL) {
        return res;
    }

    // CPyDict_GetItem() generates a PyExc_KeyError when the key is not
    // found; drop it before inserting the default.
    PyErr_Clear();

    PyObject *new_obj;
    switch (data_type) {
    case 1:
        new_obj = PyList_New(0);
        break;
    case 2:
        new_obj = PyDict_New();
        break;
    case 3:
        new_obj = PySet_New(NULL);
        break;
    default:
        return NULL;
    }
    // Allocation can fail; never hand a NULL value to the setter.
    if (new_obj == NULL) {
        return NULL;
    }

    if (CPyDict_SetItem(dict, key, new_obj) == -1) {
        // The container was never stored, so release our only reference.
        Py_DECREF(new_obj);
        return NULL;
    }
    return new_obj;
}
// dict[key] = value.
//
// Exact dicts use PyDict_SetItem; everything else goes through the generic
// PyObject_SetItem protocol. The chosen call's status is returned unchanged.
int CPyDict_SetItem(PyObject *dict, PyObject *key, PyObject *value) {
    return PyDict_CheckExact(dict) ? PyDict_SetItem(dict, key, value)
                                   : PyObject_SetItem(dict, key, value);
}
// Turn the result of a call into a status code: NULL becomes -1, any other
// object is released and reported as 0.
static inline int CPy_ObjectToStatus(PyObject *obj) {
    if (obj == NULL) {
        return -1;
    }
    Py_DECREF(obj);
    return 0;
}
// Generic dict.update(stuff): calls the object's "update" method dynamically.
// Returns 0 on success and -1 on error.
static int CPyDict_UpdateGeneral(PyObject *dict, PyObject *stuff) {
    _Py_IDENTIFIER(update);
    PyObject *method = _PyUnicode_FromId(&PyId_update); /* borrowed */
    if (method == NULL) {
        return -1;
    }
    return CPy_ObjectToStatus(PyObject_CallMethodOneArg(dict, method, stuff));
}
// Merge 'stuff' into 'dict' with PyDict_Update, turning an AttributeError
// from the merge into a TypeError that names the offending type.
// Returns the PyDict_Update status.
//
// Modelled on
// https://github.com/python/cpython/blob/55d035113dfb1bd90495c8571758f504ae8d4802/Python/ceval.c#L2710
int CPyDict_UpdateInDisplay(PyObject *dict, PyObject *stuff) {
    int status = PyDict_Update(dict, stuff);
    if (status < 0 && PyErr_ExceptionMatches(PyExc_AttributeError)) {
        PyErr_Format(PyExc_TypeError,
                     "'%.200s' object is not a mapping",
                     Py_TYPE(stuff)->tp_name);
    }
    return status;
}
// dict.update(stuff): PyDict_Update for exact dicts, a dynamic "update"
// method call for everything else. Returns the status of the chosen call.
int CPyDict_Update(PyObject *dict, PyObject *stuff) {
    return PyDict_CheckExact(dict) ? PyDict_Update(dict, stuff)
                                   : CPyDict_UpdateGeneral(dict, stuff);
}
// dict.update(stuff) where 'stuff' may be a mapping or a sequence of pairs.
//
// For an exact dict target, a source that is a dict or has a "keys"
// attribute is merged with PyDict_Update; any other source is merged as a
// sequence of pairs via PyDict_MergeFromSeq2 (override enabled). Non-exact
// targets use the dynamic "update" method call.
// Returns the status of the chosen call.
int CPyDict_UpdateFromAny(PyObject *dict, PyObject *stuff) {
    if (!PyDict_CheckExact(dict)) {
        return CPyDict_UpdateGeneral(dict, stuff);
    }

    _Py_IDENTIFIER(keys);
    if (PyDict_Check(stuff) || _CPyObject_HasAttrId(stuff, &PyId_keys)) {
        return PyDict_Update(dict, stuff);
    }
    return PyDict_MergeFromSeq2(dict, stuff, 1);
}
// dict(obj).
//
// A dict argument is copied with PyDict_Copy. Otherwise a fresh dict is
// filled from obj: through PyDict_Update when obj has a "keys" attribute,
// else through PyDict_MergeFromSeq2 (override enabled).
// Returns a new reference, or NULL on error.
PyObject *CPyDict_FromAny(PyObject *obj) {
    if (PyDict_Check(obj)) {
        return PyDict_Copy(obj);
    }

    PyObject *fresh = PyDict_New();
    if (fresh == NULL) {
        return NULL;
    }

    _Py_IDENTIFIER(keys);
    int status = _CPyObject_HasAttrId(obj, &PyId_keys)
                     ? PyDict_Update(fresh, obj)
                     : PyDict_MergeFromSeq2(fresh, obj, 1);
    if (status >= 0) {
        return fresh;
    }

    Py_DECREF(fresh);
    return NULL;
}
// dict.keys() as a view object.
//
// Exact dicts get a keys view built directly; other objects have their
// "keys" method called. Returns the view, or NULL on error.
PyObject *CPyDict_KeysView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        _Py_IDENTIFIER(keys);
        PyObject *method = _PyUnicode_FromId(&PyId_keys); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodNoArgs(dict, method);
    }
    return _CPyDictView_New(dict, &PyDictKeys_Type);
}
// dict.values() as a view object.
//
// Exact dicts get a values view built directly; other objects have their
// "values" method called. Returns the view, or NULL on error.
PyObject *CPyDict_ValuesView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        _Py_IDENTIFIER(values);
        PyObject *method = _PyUnicode_FromId(&PyId_values); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodNoArgs(dict, method);
    }
    return _CPyDictView_New(dict, &PyDictValues_Type);
}
// dict.items() as a view object.
//
// Exact dicts get an items view built directly; other objects have their
// "items" method called. Returns the view, or NULL on error.
PyObject *CPyDict_ItemsView(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        _Py_IDENTIFIER(items);
        PyObject *method = _PyUnicode_FromId(&PyId_items); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodNoArgs(dict, method);
    }
    return _CPyDictView_New(dict, &PyDictItems_Type);
}
// list(dict.keys()).
//
// Exact dicts use PyDict_Keys. For other objects the "keys" method is called
// and its result is copied into a new list, so a list is returned in both
// cases.
// Returns a new reference, or NULL with an exception set.
PyObject *CPyDict_Keys(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Keys(dict);
    }
    // Inline generic fallback logic to also return a list.
    _Py_IDENTIFIER(keys);
    PyObject *name = _PyUnicode_FromId(&PyId_keys); /* borrowed */
    if (name == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
    if (view == NULL) {
        return NULL;
    }
    // Allocate the list only once the view exists, and check the allocation,
    // so that no error path can leak the list or extend a NULL list.
    PyObject *list = PyList_New(0);
    if (list == NULL) {
        Py_DECREF(view);
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);
        return NULL;
    }
    return list;
}
// list(dict.values()).
//
// Exact dicts use PyDict_Values. For other objects the "values" method is
// called and its result is copied into a new list, so a list is returned in
// both cases.
// Returns a new reference, or NULL with an exception set.
PyObject *CPyDict_Values(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Values(dict);
    }
    // Inline generic fallback logic to also return a list.
    _Py_IDENTIFIER(values);
    PyObject *name = _PyUnicode_FromId(&PyId_values); /* borrowed */
    if (name == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
    if (view == NULL) {
        return NULL;
    }
    // Allocate the list only once the view exists, and check the allocation,
    // so that no error path can leak the list or extend a NULL list.
    PyObject *list = PyList_New(0);
    if (list == NULL) {
        Py_DECREF(view);
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);
        return NULL;
    }
    return list;
}
// list(dict.items()).
//
// Exact dicts use PyDict_Items. For other objects the "items" method is
// called and its result is copied into a new list, so a list is returned in
// both cases.
// Returns a new reference, or NULL with an exception set.
PyObject *CPyDict_Items(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        return PyDict_Items(dict);
    }
    // Inline generic fallback logic to also return a list.
    _Py_IDENTIFIER(items);
    PyObject *name = _PyUnicode_FromId(&PyId_items); /* borrowed */
    if (name == NULL) {
        return NULL;
    }
    PyObject *view = PyObject_CallMethodNoArgs(dict, name);
    if (view == NULL) {
        return NULL;
    }
    // Allocate the list only once the view exists, and check the allocation,
    // so that no error path can leak the list or extend a NULL list.
    PyObject *list = PyList_New(0);
    if (list == NULL) {
        Py_DECREF(view);
        return NULL;
    }
    int res = PyList_Extend(list, view);
    Py_DECREF(view);
    if (res < 0) {
        Py_DECREF(list);
        return NULL;
    }
    return list;
}
// dict.clear().
//
// Exact dicts are cleared with PyDict_Clear; other objects have their
// "clear" method called.
// Returns 1 on success and 0 on error (with an exception set).
char CPyDict_Clear(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        PyDict_Clear(dict);
    } else {
        _Py_IDENTIFIER(clear);
        PyObject *name = _PyUnicode_FromId(&PyId_clear); /* borrowed */
        if (name == NULL) {
            return 0;
        }
        PyObject *res = PyObject_CallMethodNoArgs(dict, name);
        if (res == NULL) {
            return 0;
        }
        // The call returns a new reference; release it so it is not leaked.
        Py_DECREF(res);
    }
    return 1;
}
// dict.copy().
//
// Exact dicts use PyDict_Copy; other objects have their "copy" method
// called. Returns the copy, or NULL on error.
PyObject *CPyDict_Copy(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        _Py_IDENTIFIER(copy);
        PyObject *method = _PyUnicode_FromId(&PyId_copy); /* borrowed */
        if (method == NULL) {
            return NULL;
        }
        return PyObject_CallMethodNoArgs(dict, method);
    }
    return PyDict_Copy(dict);
}
// Prepare iteration over the keys of 'dict'.
//
// For an exact dict the dict itself is returned (as a new reference), which
// signals that the fast iteration path can be used. Otherwise the result of
// PyObject_GetIter(dict) is returned.
PyObject *CPyDict_GetKeysIter(PyObject *dict) {
    if (!PyDict_CheckExact(dict)) {
        return PyObject_GetIter(dict);
    }
    Py_INCREF(dict);
    return dict;
}
// Prepare iteration over the items of 'dict'.
//
// For an exact dict the dict itself is returned (as a new reference), which
// signals that the fast iteration path can be used. Otherwise the "items"
// method is called and an iterator over its result is returned.
// Returns NULL on error.
PyObject *CPyDict_GetItemsIter(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        Py_INCREF(dict);
        return dict;
    }

    _Py_IDENTIFIER(items);
    PyObject *method = _PyUnicode_FromId(&PyId_items); /* borrowed */
    PyObject *view = (method != NULL) ? PyObject_CallMethodNoArgs(dict, method)
                                      : NULL;
    if (view == NULL) {
        return NULL;
    }

    PyObject *it = PyObject_GetIter(view);
    Py_DECREF(view);
    return it;
}
// Prepare iteration over the values of 'dict'.
//
// For an exact dict the dict itself is returned (as a new reference), which
// signals that the fast iteration path can be used. Otherwise the "values"
// method is called and an iterator over its result is returned.
// Returns NULL on error.
PyObject *CPyDict_GetValuesIter(PyObject *dict) {
    if (PyDict_CheckExact(dict)) {
        Py_INCREF(dict);
        return dict;
    }

    _Py_IDENTIFIER(values);
    PyObject *method = _PyUnicode_FromId(&PyId_values); /* borrowed */
    PyObject *view = (method != NULL) ? PyObject_CallMethodNoArgs(dict, method)
                                      : NULL;
    if (view == NULL) {
        return NULL;
    }

    PyObject *it = PyObject_GetIter(view);
    Py_DECREF(view);
    return it;
}
// Advance 'dict_iter' and record the outcome in 'ret': when an item is
// produced, f0 is set to 1 and f2 to that item; otherwise f0 is set to 0 and
// f2 to a new reference to None.
static void _CPyDict_FromNext(tuple_T3CIO *ret, PyObject *dict_iter) {
    PyObject *next = PyIter_Next(dict_iter);
    if (next != NULL) {
        ret->f0 = 1;
        ret->f2 = next;
        return;
    }
    ret->f0 = 0;
    Py_INCREF(Py_None);
    ret->f2 = Py_None;
}
// Helpers for fast dictionary iteration. Each one returns a single tuple
// instead of writing to several registers. Exact dicts take the fast
// PyDict_Next path; any other object is treated as an iterator and advanced
// with the generic iterator logic.
//
// CPyDict_NextKey yields (should continue?, next offset, key). The key slot
// always holds an owned reference (None when iteration has finished).
tuple_T3CIO CPyDict_NextKey(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T3CIO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
    PyObject *unused_value;

    if (!PyDict_CheckExact(dict_or_iter)) {
        // The offset is meaningless for iterators; pass the old value through.
        ret.f1 = offset;
        _CPyDict_FromNext(&ret, dict_or_iter);
        return ret;
    }

    ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &unused_value);
    if (!ret.f0) {
        // Use None as the key so mypyc can manage refcounts uniformly.
        ret.f1 = 0;
        ret.f2 = Py_None;
    } else {
        ret.f1 = CPyTagged_FromSsize_t(py_offset);
    }
    // PyDict_Next() hands out borrowed references.
    Py_INCREF(ret.f2);
    return ret;
}
// Value counterpart of CPyDict_NextKey: yields (should continue?, next
// offset, value). The value slot always holds an owned reference (None when
// iteration has finished). Exact dicts use PyDict_Next; any other object is
// treated as an iterator.
tuple_T3CIO CPyDict_NextValue(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T3CIO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);
    PyObject *unused_key;

    if (!PyDict_CheckExact(dict_or_iter)) {
        // The offset is meaningless for iterators; pass the old value through.
        ret.f1 = offset;
        _CPyDict_FromNext(&ret, dict_or_iter);
        return ret;
    }

    ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &unused_key, &ret.f2);
    if (!ret.f0) {
        // Use None as the value so mypyc can manage refcounts uniformly.
        ret.f1 = 0;
        ret.f2 = Py_None;
    } else {
        ret.f1 = CPyTagged_FromSsize_t(py_offset);
    }
    // PyDict_Next() hands out borrowed references.
    Py_INCREF(ret.f2);
    return ret;
}
// Item counterpart of CPyDict_NextKey: yields (should continue?, next
// offset, key, value).
//
// Exact dicts use PyDict_Next. Any other object is treated as an iterator
// that must produce 2-tuples; a non-tuple or wrongly sized item sets
// TypeError and stops the iteration (f0 == 0).
//
// The key and value slots always hold owned references (None when the
// iteration has finished or failed).
tuple_T4CIOO CPyDict_NextItem(PyObject *dict_or_iter, CPyTagged offset) {
    tuple_T4CIOO ret;
    Py_ssize_t py_offset = CPyTagged_AsSsize_t(offset);

    if (PyDict_CheckExact(dict_or_iter)) {
        ret.f0 = PyDict_Next(dict_or_iter, &py_offset, &ret.f2, &ret.f3);
        if (ret.f0) {
            ret.f1 = CPyTagged_FromSsize_t(py_offset);
        } else {
            // Set key and value to None, so mypyc can manage refcounts.
            ret.f1 = 0;
            ret.f2 = Py_None;
            ret.f3 = Py_None;
        }
        // PyDict_Next() returns borrowed references.
        Py_INCREF(ret.f2);
        Py_INCREF(ret.f3);
        return ret;
    }

    // offset is a dummy for iterators, just pass the old value through.
    ret.f1 = offset;
    PyObject *item = PyIter_Next(dict_or_iter);
    if (item != NULL && PyTuple_Check(item) && PyTuple_GET_SIZE(item) == 2) {
        ret.f0 = 1;
        ret.f2 = PyTuple_GET_ITEM(item, 0);
        ret.f3 = PyTuple_GET_ITEM(item, 1);
        // Take our own references BEFORE releasing the tuple: if the tuple
        // held the last references, dropping it first would free the
        // key/value while we still point at them.
        Py_INCREF(ret.f2);
        Py_INCREF(ret.f3);
        Py_DECREF(item);
        return ret;
    }

    if (item != NULL) {
        PyErr_SetString(PyExc_TypeError, "a tuple of length 2 expected");
        // The malformed item is not returned to anyone; release it.
        Py_DECREF(item);
    }
    ret.f0 = 0;
    ret.f2 = Py_None;
    ret.f3 = Py_None;
    Py_INCREF(ret.f2);
    Py_INCREF(ret.f3);
    return ret;
}
// Report whether the type of 'obj' has the Py_TPFLAGS_MAPPING flag set.
// The result is the masked flag value, i.e. non-zero when set and 0 when not.
int CPyMapping_Check(PyObject *obj) {
    PyTypeObject *type = Py_TYPE(obj);
    return type->tp_flags & Py_TPFLAGS_MAPPING;
}

View File

@@ -0,0 +1,261 @@
#include "pythoncapi_compat.h"
// Exception related primitive operations
//
// These are registered in mypyc.primitives.exc_ops.
#include <Python.h>
#include "CPy.h"
// Set the current exception from 'exc'.
//
// When 'exc' is a type it is called with no arguments and the resulting
// object is raised; if that call fails, its error is left in place.
// Otherwise 'exc' is raised as an instance of its own type.
void CPy_Raise(PyObject *exc) {
    if (!PyObject_IsInstance(exc, (PyObject *)&PyType_Type)) {
        PyErr_SetObject((PyObject *)Py_TYPE(exc), exc);
        return;
    }

    PyObject *instance = PyObject_CallNoArgs(exc);
    if (instance == NULL) {
        return;
    }
    PyErr_SetObject(exc, instance);
    Py_DECREF(instance);
}
// Re-raise the exception currently being handled: read the exc_info triple
// and install it as the pending error. PyErr_Restore takes over the
// references obtained from PyErr_GetExcInfo.
void CPy_Reraise(void) {
    PyObject *exc_type;
    PyObject *exc_value;
    PyObject *exc_tb;

    PyErr_GetExcInfo(&exc_type, &exc_value, &exc_tb);
    PyErr_Restore(exc_type, exc_value, exc_tb);
}
// Set the pending error to (type, value, traceback).
//
// If 'type' is not a type object and 'value' is None, 'type' is taken to be
// an exception instance: it becomes the value and its type becomes the type.
void CPyErr_SetObjectAndTraceback(PyObject *type, PyObject *value, PyObject *traceback) {
    PyObject *exc_type = type;
    PyObject *exc_value = value;

    if (!PyType_Check(type) && Py_IsNone(value)) {
        // The first argument must be an exception instance
        exc_value = type;
        exc_type = (PyObject *)Py_TYPE(exc_value);
    }

    // PyErr_Restore takes away one reference to each argument, so add one
    // to each object first; the caller's references stay untouched.
    Py_INCREF(exc_type);
    Py_INCREF(exc_value);
    Py_INCREF(traceback);
    PyErr_Restore(exc_type, exc_value, traceback);
}
// Begin handling the currently raised exception (entry to an except block).
//
// Moves the pending error from the error indicator into the exc_info state
// and clears the error indicator. Returns the *previous* exc_info triple
// (with NULL entries replaced via _CPy_ToDummy) so that the caller can pass
// it to CPy_RestoreExcInfo() once handling is finished.
//
// If no error is pending, a RuntimeError is raised first and then caught.
tuple_T3OOO CPy_CatchError(void) {
// We need to return the existing sys.exc_info() information, so
// that it can be restored when we finish handling the error we
// are catching now. Grab that triple and convert NULL values to
// the ExcDummy object in order to simplify refcount handling in
// generated code.
tuple_T3OOO ret;
PyErr_GetExcInfo(&ret.f0, &ret.f1, &ret.f2);
_CPy_ToDummy(&ret.f0);
_CPy_ToDummy(&ret.f1);
_CPy_ToDummy(&ret.f2);
if (!PyErr_Occurred()) {
PyErr_SetString(PyExc_RuntimeError, "CPy_CatchError called with no error!");
}
// Retrieve the error info and normalize it so that it looks like
// what python code needs it to be.
PyObject *type, *value, *traceback;
PyErr_Fetch(&type, &value, &traceback);
// Could we avoid always normalizing?
PyErr_NormalizeException(&type, &value, &traceback);
if (traceback != NULL) {
PyException_SetTraceback(value, traceback);
}
// Indicate that we are now handling this exception by stashing it
// in sys.exc_info(). mypyc routines that need access to the
// exception will read it out of there.
PyErr_SetExcInfo(type, value, traceback);
// Clear the error indicator, since the exception isn't
// propagating anymore.
PyErr_Clear();
return ret;
}
// Reinstall an exc_info triple, converting each field back with
// _CPy_FromDummy() before passing it to PyErr_SetExcInfo().
void CPy_RestoreExcInfo(tuple_T3OOO info) {
    PyObject *exc_type = _CPy_FromDummy(info.f0);
    PyObject *exc_value = _CPy_FromDummy(info.f1);
    PyObject *exc_tb = _CPy_FromDummy(info.f2);
    PyErr_SetExcInfo(exc_type, exc_value, exc_tb);
}
// Return true if the type of the exception currently being handled (read
// from CPy_ExcState()) matches `type`, per PyErr_GivenExceptionMatches().
bool CPy_ExceptionMatches(PyObject *type) {
    PyObject *handled = CPy_ExcState()->exc_value;
    PyObject *handled_type = (PyObject *)Py_TYPE(handled);
    return PyErr_GivenExceptionMatches(handled_type, type);
}
// Return a new reference to the exception value currently being handled
// (read from CPy_ExcState()).
PyObject *CPy_GetExcValue(void) {
    PyObject *handled = CPy_ExcState()->exc_value;
    Py_INCREF(handled);
    return handled;
}
// Replace a NULL slot with a new reference to None; non-NULL slots are
// left untouched.
static inline void _CPy_ToNone(PyObject **p) {
    if (*p != NULL) {
        return;
    }
    *p = Py_None;
    Py_INCREF(Py_None);
}
// Fetch the exc_info triple into the three output slots, substituting None
// for any entry that PyErr_GetExcInfo() reports as NULL.
void _CPy_GetExcInfo(PyObject **p_type, PyObject **p_value, PyObject **p_traceback) {
    PyObject **slots[3] = {p_type, p_value, p_traceback};

    PyErr_GetExcInfo(p_type, p_value, p_traceback);
    for (int k = 0; k < 3; k++) {
        _CPy_ToNone(slots[k]);
    }
}
// Return the exc_info triple as a tuple struct (f0 = type, f1 = value,
// f2 = traceback), as filled in by _CPy_GetExcInfo().
tuple_T3OOO CPy_GetExcInfo(void) {
    tuple_T3OOO info;
    _CPy_GetExcInfo(&info.f0, &info.f1, &info.f2);
    return info;
}
// Report an out-of-memory condition on stderr and abort the process.
// This function does not return.
void CPyError_OutOfMemory(void) {
    fputs("fatal: out of memory\n", stderr);
    fflush(stderr);
    abort();
}
// Build a readable name for a type from its __module__ and __qualname__.
//
// Types from the "builtins" module are shown by qualified name only; all
// others are shown as "module.qualname". Returns a new reference, or NULL if
// either attribute is missing or is not a str.
static PyObject *CPy_GetTypeName(PyObject *type) {
    PyObject *result = NULL;
    PyObject *qualname = NULL;
    PyObject *modname = PyObject_GetAttrString(type, "__module__");

    if (modname != NULL && PyUnicode_Check(modname)) {
        qualname = PyObject_GetAttrString(type, "__qualname__");
        if (qualname != NULL && PyUnicode_Check(qualname)) {
            if (PyUnicode_CompareWithASCIIString(modname, "builtins") != 0) {
                result = PyUnicode_FromFormat("%U.%U", modname, qualname);
            } else {
                result = qualname;
                Py_INCREF(result);
            }
        }
    }

    Py_XDECREF(modname);
    Py_XDECREF(qualname);
    return result;
}
// Get the type of a value as a string, expanding tuples to include
// all the element types.
//
// - None is rendered as "None".
// - Exact tuples with at most 10 items are rendered as "tuple[T1, T2, ...]",
//   recursing into each element.
// - Longer tuples are summarized as "tuple[<N items>]".
// - Anything else is rendered with CPy_GetTypeName().
//
// Returns a new str reference, or NULL on failure.
static PyObject *CPy_FormatTypeName(PyObject *value) {
    if (Py_IsNone(value)) {
        return PyUnicode_FromString("None");
    }

    if (!PyTuple_CheckExact(value)) {
        return CPy_GetTypeName((PyObject *)Py_TYPE(value));
    }

    Py_ssize_t size = PyTuple_GET_SIZE(value);
    if (size > 10) {
        // The size is a Py_ssize_t, so it must be formatted with %zd; passing
        // it for %d is a varargs type mismatch on 64-bit platforms.
        return PyUnicode_FromFormat("tuple[<%zd items>]", size);
    }

    // Most of the logic is all for tuples, which is the only interesting case
    PyObject *output = PyUnicode_FromString("tuple[");
    if (!output) {
        return NULL;
    }
    /* This is quadratic but if that ever matters something is really weird. */
    for (Py_ssize_t i = 0; i < size; i++) {
        PyObject *s = CPy_FormatTypeName(PyTuple_GET_ITEM(value, i));
        if (!s) {
            Py_DECREF(output);
            return NULL;
        }
        // Append the element name followed by a separator, or by the closing
        // bracket after the last element.
        PyObject *next = PyUnicode_FromFormat("%U%U%s", output, s,
                                              i + 1 == size ? "]" : ", ");
        Py_DECREF(output);
        Py_DECREF(s);
        if (!next) {
            return NULL;
        }
        output = next;
    }
    return output;
}
// Raise a TypeError saying that an object of type `expected` was required,
// including the formatted type of `value` when it can be computed.
CPy_NOINLINE
void CPy_TypeError(const char *expected, PyObject *value) {
    PyObject *actual = CPy_FormatTypeName(value);
    if (actual == NULL) {
        // Formatting the real type failed; fall back to a generic message.
        PyErr_Format(PyExc_TypeError, "%s object expected; and errored formatting real type!",
                     expected);
        return;
    }
    PyErr_Format(PyExc_TypeError, "%s object expected; got %U", expected, actual);
    Py_DECREF(actual);
}
// The PyFrameObject type definition (struct _frame) has been moved
// to the internal C API: to the pycore_frame.h header file.
// https://github.com/python/cpython/pull/31530
#if PY_VERSION_HEX >= 0x030b00a6
#include "internal/pycore_frame.h"
#endif
// Add a traceback entry for a synthetic frame (filename, funcname, line) to
// the exception that is currently set.
//
// This function is basically the same as _PyTraceback_Add, which is
// available in all the versions we support.
// We're continuing to use this because we'll probably optimize this later.
//
// If the code or frame object cannot be created, the new error is chained
// with the exception that was pending on entry.
void CPy_AddTraceback(const char *filename, const char *funcname, int line, PyObject *globals) {
    PyObject *exc, *val, *tb;
    PyThreadState *thread_state = PyThreadState_GET();
    PyFrameObject *frame_obj;

    // We need to save off the exception state because in 3.8,
    // PyFrame_New fails if there is an error set and it fails to look
    // up builtins in the globals. (_PyTraceback_Add documents that it
    // needs to do it because it decodes the filename according to the
    // FS encoding, which could have a decoder in Python. We don't do
    // that so *that* doesn't apply to us.)
    PyErr_Fetch(&exc, &val, &tb);
    PyCodeObject *code_obj = PyCode_NewEmpty(filename, funcname, line);
    if (code_obj == NULL) {
        goto error;
    }

    frame_obj = PyFrame_New(thread_state, code_obj, globals, 0);
    if (frame_obj == NULL) {
        Py_DECREF(code_obj);
        goto error;
    }
    frame_obj->f_lineno = line;
    // Put the original exception back before attaching the new frame to it.
    PyErr_Restore(exc, val, tb);
    PyTraceBack_Here(frame_obj);
    Py_DECREF(code_obj);
    Py_DECREF(frame_obj);

    return;

error:
#if CPY_3_12_FEATURES
    // _PyErr_ChainExceptions1() takes (and steals) the exception *instance*,
    // which is `val` in the triple returned by PyErr_Fetch(), not the type.
    // The type and traceback references are ours to release.
    _PyErr_ChainExceptions1(val);
    Py_XDECREF(exc);
    Py_XDECREF(tb);
#else
    _PyErr_ChainExceptions(exc, val, tb);
#endif
}
// Raise the TypeError produced by CPy_TypeError(expected, value) and then
// attach a traceback entry for the given source location.
CPy_NOINLINE
void CPy_TypeErrorTraceback(const char *filename, const char *funcname, int line,
                            PyObject *globals, const char *expected, PyObject *value) {
    // Set the error first; the traceback entry is added to the pending error.
    CPy_TypeError(expected, value);
    CPy_AddTraceback(filename, funcname, line, globals);
}
// Raise AttributeError for an undefined attribute `attrname` of class
// `classname`, then attach a traceback entry for the given source location.
void CPy_AttributeError(const char *filename, const char *funcname, const char *classname,
                        const char *attrname, int line, PyObject *globals) {
    // Both names are truncated to 200 characters, so the message always
    // fits in the buffer.
    char message[500];
    snprintf(message, sizeof(message), "attribute '%.200s' of '%.200s' undefined",
             attrname, classname);
    PyErr_SetString(PyExc_AttributeError, message);
    CPy_AddTraceback(filename, funcname, line, globals);
}

View File

@@ -0,0 +1,239 @@
// Float primitive operations
//
// These are registered in mypyc.primitives.float_ops.
#include <Python.h>
#include "CPy.h"
// Raise ValueError("math domain error") and return the float error marker.
static double CPy_DomainError(void) {
    static const char message[] = "math domain error";
    PyErr_SetString(PyExc_ValueError, message);
    return CPY_FLOAT_ERROR;
}
// Raise OverflowError("math range error") and return the float error marker.
static double CPy_MathRangeError(void) {
    static const char message[] = "math range error";
    PyErr_SetString(PyExc_OverflowError, message);
    return CPY_FLOAT_ERROR;
}
// Raise ValueError reporting that a nonnegative input was expected, quoting
// the repr of x, and return the float error marker. If x cannot be
// formatted, no ValueError is set here.
static double CPy_MathExpectedNonNegativeInputError(double x) {
    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
    if (repr == NULL) {
        return CPY_FLOAT_ERROR;
    }
    PyErr_Format(PyExc_ValueError, "expected a nonnegative input, got %s", repr);
    PyMem_Free(repr);
    return CPY_FLOAT_ERROR;
}
// Raise ValueError reporting that a positive input was expected, quoting the
// repr of x, and return the float error marker. If x cannot be formatted,
// no ValueError is set here.
static double CPy_MathExpectedPositiveInputError(double x) {
    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
    if (repr == NULL) {
        return CPY_FLOAT_ERROR;
    }
    PyErr_Format(PyExc_ValueError, "expected a positive input, got %s", repr);
    PyMem_Free(repr);
    return CPY_FLOAT_ERROR;
}
// Raise ValueError reporting that a finite input was expected, quoting the
// repr of x, and return the float error marker. If x cannot be formatted,
// no ValueError is set here.
static double CPy_MathExpectedFiniteInput(double x) {
    char *repr = PyOS_double_to_string(x, 'r', 0, Py_DTSF_ADD_DOT_0, NULL);
    if (repr == NULL) {
        return CPY_FLOAT_ERROR;
    }
    PyErr_Format(PyExc_ValueError, "expected a finite input, got %s", repr);
    PyMem_Free(repr);
    return CPY_FLOAT_ERROR;
}
// Convert a tagged int to a C double.
//
// Short (unboxed) ints are converted directly; boxed ints go through
// PyFloat_AsDouble(), and CPY_FLOAT_ERROR is returned if that raises.
double CPyFloat_FromTagged(CPyTagged x) {
    if (!CPyTagged_CheckShort(x)) {
        double converted = PyFloat_AsDouble(CPyTagged_LongAsObject(x));
        // -1.0 is a legitimate result, so only treat it as an error if an
        // exception is actually pending.
        if (unlikely(converted == -1.0) && PyErr_Occurred()) {
            return CPY_FLOAT_ERROR;
        }
        return converted;
    }
    return CPyTagged_ShortAsSsize_t(x);
}
// sin(x) with Python's math.sin error behavior: a NaN result for a non-NaN
// input raises ValueError (message depends on CPY_3_14_FEATURES).
double CPyFloat_Sin(double x) {
    double result = sin(x);
    bool invalid = unlikely(isnan(result)) && !isnan(x);
    if (!invalid) {
        return result;
    }
#if CPY_3_14_FEATURES
    return CPy_MathExpectedFiniteInput(x);
#else
    return CPy_DomainError();
#endif
}
// cos(x) with Python's math.cos error behavior: a NaN result for a non-NaN
// input raises ValueError (message depends on CPY_3_14_FEATURES).
double CPyFloat_Cos(double x) {
    double result = cos(x);
    bool invalid = unlikely(isnan(result)) && !isnan(x);
    if (!invalid) {
        return result;
    }
#if CPY_3_14_FEATURES
    return CPy_MathExpectedFiniteInput(x);
#else
    return CPy_DomainError();
#endif
}
// tan(x); an infinite input raises ValueError (message depends on
// CPY_3_14_FEATURES) instead of calling tan().
double CPyFloat_Tan(double x) {
    if (!unlikely(isinf(x))) {
        return tan(x);
    }
#if CPY_3_14_FEATURES
    return CPy_MathExpectedFiniteInput(x);
#else
    return CPy_DomainError();
#endif
}
// sqrt(x); a negative input raises ValueError (message depends on
// CPY_3_14_FEATURES). NaN is not negative and is passed through to sqrt().
double CPyFloat_Sqrt(double x) {
    if (!(x < 0.0)) {
        return sqrt(x);
    }
#if CPY_3_14_FEATURES
    return CPy_MathExpectedNonNegativeInputError(x);
#else
    return CPy_DomainError();
#endif
}
// exp(x); a result of +inf for an input other than +inf raises
// OverflowError.
double CPyFloat_Exp(double x) {
    double result = exp(x);
    bool overflowed = unlikely(result == INFINITY) && x != INFINITY;
    if (overflowed) {
        return CPy_MathRangeError();
    }
    return result;
}
// log(x); zero or negative input raises ValueError (message depends on
// CPY_3_14_FEATURES). NaN fails the `<=` test and is passed through to log().
double CPyFloat_Log(double x) {
    if (!(x <= 0.0)) {
        return log(x);
    }
#if CPY_3_14_FEATURES
    return CPy_MathExpectedPositiveInputError(x);
#else
    return CPy_DomainError();
#endif
}
// Round x down with floor() and convert the result to a tagged int via
// CPyTagged_FromFloat().
CPyTagged CPyFloat_Floor(double x) {
    return CPyTagged_FromFloat(floor(x));
}
// Round x up with ceil() and convert the result to a tagged int via
// CPyTagged_FromFloat().
CPyTagged CPyFloat_Ceil(double x) {
    return CPyTagged_FromFloat(ceil(x));
}
// Return true if x is positive or negative infinity.
bool CPyFloat_IsInf(double x) {
    if (isinf(x)) {
        return true;
    }
    return false;
}
// Return true if x is a NaN.
bool CPyFloat_IsNaN(double x) {
    if (isnan(x)) {
        return true;
    }
    return false;
}
// From CPython 3.10.0, Objects/floatobject.c
//
// Compute the floor-division quotient and the remainder of vx / wx.
//
//   *floordiv receives the quotient snapped to an integral value.
//   *mod      receives the remainder, with the same sign as wx.
//
// The function itself does not check for wx == 0; the caller visible in this
// file (CPyFloat_FloorDivide) rejects a zero divisor before calling.
static void
_float_div_mod(double vx, double wx, double *floordiv, double *mod)
{
double div;
*mod = fmod(vx, wx);
/* fmod is typically exact, so vx-mod is *mathematically* an
exact multiple of wx. But this is fp arithmetic, and fp
vx - mod is an approximation; the result is that div may
not be an exact integral value after the division, although
it will always be very close to one.
*/
div = (vx - *mod) / wx;
if (*mod) {
/* ensure the remainder has the same sign as the denominator */
if ((wx < 0) != (*mod < 0)) {
*mod += wx;
div -= 1.0;
}
}
else {
/* the remainder is zero, and in the presence of signed zeroes
fmod returns different results across platforms; ensure
it has the same sign as the denominator. */
*mod = copysign(0.0, wx);
}
/* snap quotient to nearest integral value */
if (div) {
*floordiv = floor(div);
if (div - *floordiv > 0.5) {
*floordiv += 1.0;
}
}
else {
/* div is zero - get the same sign as the true quotient */
*floordiv = copysign(0.0, vx / wx); /* zero w/ sign of vx/wx */
}
}
// Floor division of two floats (x // y).
//
// Raises ZeroDivisionError and returns CPY_FLOAT_ERROR when y is zero.
double CPyFloat_FloorDivide(double x, double y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "float floor division by zero");
        return CPY_FLOAT_ERROR;
    }

    double quotient;
    double remainder;  // computed by the helper but not needed here
    _float_div_mod(x, y, &quotient, &remainder);
    return quotient;
}
// Adapted from CPython 3.10.7
//
// Compute x ** y for C doubles.
//
// Non-finite operands (NaN / infinity) are handled by the explicit case
// analysis below without calling pow(). Otherwise pow() is called and a
// non-finite result is turned into an error: ValueError ("math domain error")
// for a NaN result or for a zero base, OverflowError ("math range error")
// for any other infinite result. Error returns go through CPy_DomainError()
// and CPy_MathRangeError().
double CPyFloat_Pow(double x, double y) {
if (!isfinite(x) || !isfinite(y)) {
if (isnan(x))
return y == 0.0 ? 1.0 : x; /* NaN**0 = 1 */
else if (isnan(y))
return x == 1.0 ? 1.0 : y; /* 1**NaN = 1 */
else if (isinf(x)) {
// The sign of the result follows x only when y is an odd integer.
int odd_y = isfinite(y) && fmod(fabs(y), 2.0) == 1.0;
if (y > 0.0)
return odd_y ? x : fabs(x);
else if (y == 0.0)
return 1.0;
else /* y < 0. */
return odd_y ? copysign(0.0, x) : 0.0;
}
else if (isinf(y)) {
if (fabs(x) == 1.0)
return 1.0;
else if (y > 0.0 && fabs(x) > 1.0)
return y;
else if (y < 0.0 && fabs(x) < 1.0) {
// Only builds against Python older than 3.11 report 0**-inf as an error.
#if PY_VERSION_HEX < 0x030B0000
if (x == 0.0) { /* 0**-inf: divide-by-zero */
return CPy_DomainError();
}
#endif
return -y; /* result is +inf */
} else
return 0.0;
}
}
double r = pow(x, y);
if (!isfinite(r)) {
if (isnan(r)) {
return CPy_DomainError();
}
/*
an infinite result here arises either from:
(A) (+/-0.)**negative (-> divide-by-zero)
(B) overflow of x**y with x and y finite
*/
else if (isinf(r)) {
if (x == 0.0)
return CPy_DomainError();
else
return CPy_MathRangeError();
}
}
return r;
}

View File

@@ -0,0 +1,84 @@
// Generic primitive operations
//
// These are registered in mypyc.primitives.generic_ops.
#include <Python.h>
#include "CPy.h"
// Hash an object and return the hash as a tagged int.
//
// Returns CPY_INT_TAG when PyObject_Hash() reports an error (-1).
CPyTagged CPyObject_Hash(PyObject *o) {
    Py_hash_t hash = PyObject_Hash(o);
    if (hash != -1) {
        // Hash values on 64-bit Python span all 64 bits, but short tagged
        // ints only hold 63. So about half of all hashes get boxed here for
        // no real benefit, and are most likely unboxed again right away by a
        // tp_hash wrapper.
        return CPyTagged_FromSsize_t(hash);
    }
    return CPY_INT_TAG;
}
// Three-argument getattr(): look up `name` on `v`, returning a new reference
// to `defl` if the lookup raises AttributeError. Any other error is
// propagated (NULL is returned with the exception still set).
PyObject *CPyObject_GetAttr3(PyObject *v, PyObject *name, PyObject *defl)
{
    PyObject *attr = PyObject_GetAttr(v, name);
    if (attr != NULL) {
        return attr;
    }
    if (!PyErr_ExceptionMatches(PyExc_AttributeError)) {
        return NULL;
    }
    PyErr_Clear();
    Py_INCREF(defl);
    return defl;
}
// Call the tp_iternext slot of the iterator's type directly and return its
// result unchanged.
PyObject *CPyIter_Next(PyObject *iter)
{
    iternextfunc next_item = Py_TYPE(iter)->tp_iternext;
    return next_item(iter);
}
// Two-argument power: base ** index, passing None as the modulus argument
// of PyNumber_Power().
PyObject *CPyNumber_Power(PyObject *base, PyObject *index)
{
    PyObject *no_modulus = Py_None;
    return PyNumber_Power(base, index, no_modulus);
}
// Two-argument in-place power: base **= index, passing None as the modulus
// argument of PyNumber_InPlacePower().
PyObject *CPyNumber_InPlacePower(PyObject *base, PyObject *index)
{
    PyObject *no_modulus = Py_None;
    return PyNumber_InPlacePower(base, index, no_modulus);
}
// Evaluate obj[start:end] for tagged-int bounds.
//
// Both bounds are boxed, a slice object with no step is built from them, and
// the lookup is delegated to PyObject_GetItem(). Returns a new reference, or
// NULL on failure.
PyObject *CPyObject_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    PyObject *start_obj = CPyTagged_AsObject(start);
    PyObject *end_obj = CPyTagged_AsObject(end);
    if (unlikely(start_obj == NULL || end_obj == NULL)) {
        // Only one of the two may have failed; release whichever succeeded
        // so that it is not leaked.
        Py_XDECREF(start_obj);
        Py_XDECREF(end_obj);
        return NULL;
    }
    PyObject *slice = PySlice_New(start_obj, end_obj, NULL);
    // The bounds are released here whether or not the slice was created.
    Py_DECREF(start_obj);
    Py_DECREF(end_obj);
    if (unlikely(slice == NULL)) {
        return NULL;
    }
    PyObject *result = PyObject_GetItem(obj, slice);
    Py_DECREF(slice);
    return result;
}
// Signature of the "__internal_mypyc_setup" entry stored in tp_methods.
typedef PyObject *(*SetupFunction)(PyObject *);

// Find the setup function for `type` and call it with `type`.
//
// Walks the tp_base chain and checks the *first* tp_methods entry of each
// type for the name "__internal_mypyc_setup". Raises RuntimeError and
// returns NULL if no type in the chain provides it.
PyObject *CPy_SetupObject(PyObject *type) {
    PyTypeObject *tp = (PyTypeObject *)type;

    while (tp != NULL) {
        PyMethodDef *first = tp->tp_methods;
        if (first != NULL && first->ml_name != NULL
                && strcmp(first->ml_name, "__internal_mypyc_setup") == 0) {
            SetupFunction setup = (SetupFunction)(void(*)(void))first->ml_meth;
            return setup(type);
        }
        tp = tp->tp_base;
    }

    PyErr_SetString(PyExc_RuntimeError, "Internal mypyc error: Unable to find object setup function");
    return NULL;
}

View File

@@ -0,0 +1,451 @@
/* getargs implementation copied from Python 3.8 and stripped down to only include
* the functions we need.
* We also add support for required kwonly args and accepting *args / **kwargs.
* A good idea would be to also vendor in the Fast versions and get our stuff
* working with *that*.
* Another probably good idea is to strip out all the formatting stuff we don't need
* and then add in custom stuff that we do need.
*
* DOCUMENTATION OF THE EXTENSIONS:
* - Arguments given after a @ format specifier are required keyword-only arguments.
* The | and $ specifiers must both appear before @.
* - If the first character of a format string is %, then the function can support
* *args and **kwargs. On seeing a %, the parser will consume two arguments,
* which should be pointers to variables to store the *args and **kwargs, respectively.
* Either pointer can be NULL, in which case the function doesn't take that
* variety of vararg.
* Unlike most format specifiers, the caller takes ownership of these objects
* and is responsible for decrefing them.
* - All arguments must use the 'O' format.
* - There's minimal error checking of format strings. They are generated
* programmatically and can be assumed valid.
*/
// These macro definitions are copied from pyport.h in Python 3.9 and later
// https://bugs.python.org/issue19569
#if defined(__clang__)
#define _Py_COMP_DIAG_PUSH _Pragma("clang diagnostic push")
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
_Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"")
#define _Py_COMP_DIAG_POP _Pragma("clang diagnostic pop")
#elif defined(__GNUC__) \
&& ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 6))
#define _Py_COMP_DIAG_PUSH _Pragma("GCC diagnostic push")
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS \
_Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"")
#define _Py_COMP_DIAG_POP _Pragma("GCC diagnostic pop")
#elif defined(_MSC_VER)
#define _Py_COMP_DIAG_PUSH __pragma(warning(push))
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS __pragma(warning(disable: 4996))
#define _Py_COMP_DIAG_POP __pragma(warning(pop))
#else
#define _Py_COMP_DIAG_PUSH
#define _Py_COMP_DIAG_IGNORE_DEPR_DECLS
#define _Py_COMP_DIAG_POP
#endif
#include "Python.h"
#include "pythonsupport.h"
#include <ctype.h>
#include <float.h>
#ifndef PyDict_GET_SIZE
#define PyDict_GET_SIZE(d) PyDict_Size(d)
#endif
#ifdef __cplusplus
extern "C" {
#endif
int CPyArg_ParseTupleAndKeywords(PyObject *, PyObject *,
const char *, const char *, const char * const *, ...);
/* Forward */
static int vgetargskeywords(PyObject *, PyObject *,
const char *, const char *, const char * const *, va_list *);
static void skipitem(const char **, va_list *);
/* Support for keyword arguments donated by
Geoff Philbrick <philbric@delphi.hks.com> */
/* Return false (0) for error, else true. */
/* Variadic front end for vgetargskeywords(): parse `args` / `keywords`
   according to `format` and `kwlist`, storing results through the trailing
   pointer arguments. Returns false (0) for error, else true. */
int
CPyArg_ParseTupleAndKeywords(PyObject *args,
                             PyObject *keywords,
                             const char *format,
                             const char *fname,
                             const char * const *kwlist, ...)
{
    va_list ap;

    va_start(ap, kwlist);
    int ok = vgetargskeywords(args, keywords, format, fname, kwlist, &ap);
    va_end(ap);
    return ok;
}
#define IS_END_OF_FORMAT(c) (c == '\0' || c == ';' || c == ':')
/* Core of CPyArg_ParseTupleAndKeywords().
 *
 * args    - tuple of positional arguments (asserted non-NULL)
 * kwargs  - dict of keyword arguments, or NULL
 * format  - format string; may start with '%' (accept *args / **kwargs) and
 *           contain the '|', '$' and '@' markers described at the top of
 *           this file
 * fname   - function name used in error messages, or NULL
 * kwlist  - NULL-terminated list of parameter names; leading empty names
 *           denote positional-only parameters
 * p_va    - output pointers, one PyObject ** per parameter (preceded by the
 *           *args and **kwargs output pointers when format starts with '%')
 *
 * Each parameter value is stored and then immediately decref'd below, so the
 * stored pointer does not carry a reference of its own. The objects stored in
 * *p_args and *p_kwargs are newly created here.
 *
 * Returns 1 on success, 0 on failure.
 */
static int
vgetargskeywords(PyObject *args, PyObject *kwargs, const char *format,
const char *fname, const char * const *kwlist, va_list *p_va)
{
/* min: index of the first optional parameter ('|');
   max: index of the first keyword-only parameter ('$');
   INT_MAX means the marker has not been seen. */
int min = INT_MAX;
int max = INT_MAX;
int required_kwonly_start = INT_MAX;
int has_required_kws = 0;
int i, pos, len;
int skip = 0;
Py_ssize_t nargs, nkwargs;
PyObject *current_arg;
int bound_pos_args;
PyObject **p_args = NULL, **p_kwargs = NULL;
assert(args != NULL && PyTuple_Check(args));
assert(kwargs == NULL || PyDict_Check(kwargs));
assert(format != NULL);
assert(kwlist != NULL);
assert(p_va != NULL);
/* scan kwlist and count the number of positional-only parameters */
for (pos = 0; kwlist[pos] && !*kwlist[pos]; pos++) {
}
/* scan kwlist and get greatest possible nbr of args */
for (len = pos; kwlist[len]; len++) {
#ifdef DEBUG
if (!*kwlist[len]) {
PyErr_SetString(PyExc_SystemError,
"Empty keyword parameter name");
return 0;
}
#endif
}
/* A leading '%' means the first two varargs are the *args and **kwargs
   output pointers (either may be NULL). */
if (*format == '%') {
p_args = va_arg(*p_va, PyObject **);
p_kwargs = va_arg(*p_va, PyObject **);
format++;
}
nargs = PyTuple_GET_SIZE(args);
nkwargs = (kwargs == NULL) ? 0 : PyDict_GET_SIZE(kwargs);
if (unlikely(nargs + nkwargs > len && !p_args && !p_kwargs)) {
/* Adding "keyword" (when nargs == 0) prevents producing wrong error
messages in some special cases (see bpo-31229). */
PyErr_Format(PyExc_TypeError,
"%.200s%s takes at most %d %sargument%s (%zd given)",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
len,
(nargs == 0) ? "keyword " : "",
(len == 1) ? "" : "s",
nargs + nkwargs);
return 0;
}
/* convert tuple args and keyword args in same loop, using kwlist to drive process */
for (i = 0; i < len; i++) {
if (*format == '|') {
#ifdef DEBUG
if (min != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string (| specified twice)");
return 0;
}
#endif
min = i;
format++;
#ifdef DEBUG
if (max != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string ($ before |)");
return 0;
}
#endif
/* If there are optional args, figure out whether we have
* required keyword arguments so that we don't bail without
* enforcing them. */
has_required_kws = strchr(format, '@') != NULL;
}
if (*format == '$') {
#ifdef DEBUG
if (max != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string ($ specified twice)");
return 0;
}
#endif
max = i;
format++;
#ifdef DEBUG
if (max < pos) {
PyErr_SetString(PyExc_SystemError,
"Empty parameter name after $");
return 0;
}
#endif
if (skip) {
/* Now we know the minimal and the maximal numbers of
* positional arguments and can raise an exception with
* informative message (see below). */
break;
}
if (unlikely(max < nargs && !p_args)) {
if (max == 0) {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes no positional arguments",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()");
}
else {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes %s %d positional argument%s"
" (%zd given)",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
(min < max) ? "at most" : "exactly",
max,
max == 1 ? "" : "s",
nargs);
}
return 0;
}
}
if (*format == '@') {
#ifdef DEBUG
if (min == INT_MAX && max == INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string "
"(@ without preceding | and $)");
return 0;
}
if (required_kwonly_start != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string (@ specified twice)");
return 0;
}
#endif
required_kwonly_start = i;
format++;
}
#ifdef DEBUG
if (IS_END_OF_FORMAT(*format)) {
PyErr_Format(PyExc_SystemError,
"More keyword list entries (%d) than "
"format specifiers (%d)", len, i);
return 0;
}
#endif
if (!skip) {
/* Take the value from the positional tuple if there is one for this
   index, otherwise look it up by name in kwargs. */
if (i < nargs && i < max) {
current_arg = Py_NewRef(PyTuple_GET_ITEM(args, i));
}
else if (nkwargs && i >= pos) {
if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
return 0;
}
if (current_arg) {
--nkwargs;
}
}
else {
current_arg = NULL;
}
if (current_arg) {
PyObject **p = va_arg(*p_va, PyObject **);
*p = current_arg;
/* Our own reference is dropped right after storing the pointer. */
Py_DECREF(current_arg);
format++;
continue;
}
if (i < min || i >= required_kwonly_start) {
if (likely(i < pos)) {
assert (min == INT_MAX);
assert (max == INT_MAX);
skip = 1;
/* At that moment we still don't know the minimal and
* the maximal numbers of positional arguments. Raising
* an exception is deferred until we encounter | and $
* or the end of the format. */
}
else {
if (i >= max) {
PyErr_Format(PyExc_TypeError,
"%.200s%s missing required "
"keyword-only argument '%s'",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
kwlist[i]);
}
else {
PyErr_Format(PyExc_TypeError,
"%.200s%s missing required "
"argument '%s' (pos %d)",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
kwlist[i], i+1);
}
return 0;
}
}
/* current code reports success when all required args
* fulfilled and no keyword args left, with no further
* validation. XXX Maybe skip this in debug build ?
*/
if (!nkwargs && !skip && !has_required_kws &&
!p_args && !p_kwargs)
{
return 1;
}
}
/* We are into optional args, skip through to any remaining
* keyword args */
skipitem(&format, p_va);
}
if (unlikely(skip)) {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes %s %d positional argument%s"
" (%zd given)",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
(Py_MIN(pos, min) < i) ? "at least" : "exactly",
Py_MIN(pos, min),
Py_MIN(pos, min) == 1 ? "" : "s",
nargs);
return 0;
}
#ifdef DEBUG
if (!IS_END_OF_FORMAT(*format) &&
(*format != '|') && (*format != '$') && (*format != '@'))
{
PyErr_Format(PyExc_SystemError,
"more argument specifiers than keyword list entries "
"(remaining format:'%s')", format);
return 0;
}
#endif
/* Number of positional arguments consumed by named parameters; anything
   beyond that goes into *args. */
bound_pos_args = Py_MIN(nargs, Py_MIN(max, len));
if (p_args) {
*p_args = PyTuple_GetSlice(args, bound_pos_args, nargs);
if (!*p_args) {
return 0;
}
}
if (p_kwargs) {
/* This unfortunately needs to be special cased because if len is 0 then we
* never go through the main loop. */
if (unlikely(nargs > 0 && len == 0 && !p_args)) {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes no positional arguments",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()");
return 0;
}
*p_kwargs = PyDict_New();
if (!*p_kwargs) {
goto latefail;
}
}
if (nkwargs > 0) {
PyObject *key, *value;
Py_ssize_t j;
/* make sure there are no arguments given by name and position */
for (i = pos; i < bound_pos_args && i < len; i++) {
PyObject *current_arg;
if (unlikely(PyDict_GetItemStringRef(kwargs, kwlist[i], &current_arg) < 0)) {
goto latefail;
}
if (unlikely(current_arg != NULL)) {
Py_DECREF(current_arg);
/* arg present in tuple and in dict */
PyErr_Format(PyExc_TypeError,
"argument for %.200s%s given by name ('%s') "
"and position (%d)",
(fname == NULL) ? "function" : fname,
(fname == NULL) ? "" : "()",
kwlist[i], i+1);
goto latefail;
}
}
/* make sure there are no extraneous keyword arguments */
j = 0;
while (PyDict_Next(kwargs, &j, &key, &value)) {
int match = 0;
if (unlikely(!PyUnicode_Check(key))) {
PyErr_SetString(PyExc_TypeError,
"keywords must be strings");
goto latefail;
}
for (i = pos; i < len; i++) {
if (PyUnicode_EqualToUTF8(key, kwlist[i])) {
match = 1;
break;
}
}
if (!match) {
/* Unknown keyword: an error unless the function accepts **kwargs,
   in which case it is collected into *p_kwargs. */
if (unlikely(!p_kwargs)) {
PyErr_Format(PyExc_TypeError,
"'%U' is an invalid keyword "
"argument for %.200s%s",
key,
(fname == NULL) ? "this function" : fname,
(fname == NULL) ? "" : "()");
goto latefail;
} else {
if (PyDict_SetItem(*p_kwargs, key, value) < 0) {
goto latefail;
}
}
}
}
}
return 1;
/* Handle failures that have happened after we have tried to
* create *args and **kwargs, if they exist. */
latefail:
if (p_args) {
Py_XDECREF(*p_args);
}
if (p_kwargs) {
Py_XDECREF(*p_kwargs);
}
return 0;
}
/* Skip one format unit without storing anything.
 *
 * Advances *p_format by one character and, when p_va is non-NULL, consumes
 * the matching PyObject ** output pointer from the va_list so that later
 * arguments stay aligned with their format characters.
 */
static void
skipitem(const char **p_format, va_list *p_va)
{
    if (p_va != NULL) {
        (void) va_arg(*p_va, PyObject **);
    }
    /* The skipped format character itself is not needed. */
    (*p_format)++;
}
#ifdef __cplusplus
};
#endif

View File

@@ -0,0 +1,569 @@
/* getargskeywordsfast implementation copied from Python 3.9 and stripped down to
* only include the functionality we need.
*
* We also add support for required kwonly args and accepting *args / **kwargs.
*
* DOCUMENTATION OF THE EXTENSIONS:
* - Arguments given after a @ format specify required keyword-only arguments.
* The | and $ specifiers must both appear before @.
* - If the first character of a format string is %, then the function can support
* *args and/or **kwargs. In this case the parser will consume two arguments,
* which should be pointers to variables to store the *args and **kwargs, respectively.
* Either pointer can be NULL, in which case the function doesn't take that
* variety of vararg.
* Unlike most format specifiers, the caller takes ownership of these objects
* and is responsible for decrefing them.
*/
#include <Python.h>
#include "CPy.h"
#define PARSER_INITED(parser) ((parser)->kwtuple != NULL)
/* Forward */
static int
vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
PyObject *kwargs, PyObject *kwnames,
CPyArg_Parser *parser,
va_list *p_va);
static void skipitem_fast(const char **, va_list *);
/* Parse args for an arbitrary signature */
/* Parse vectorcall-style arguments (`args` array, `nargs`, `kwnames`) for an
   arbitrary signature described by `parser`. Results are stored through the
   trailing pointer arguments. Returns the result of the underlying parser:
   nonzero on success, 0 on error. */
int
CPyArg_ParseStackAndKeywords(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                             CPyArg_Parser *parser, ...)
{
    va_list ap;

    va_start(ap, parser);
    int ok = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
    va_end(ap);
    return ok;
}
/* Parse args for a function that takes no args */
/* Parse arguments for a function that takes no arguments. When nothing was
   passed the call succeeds immediately; otherwise the general parser runs. */
int
CPyArg_ParseStackAndKeywordsNoArgs(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list ap;
    int ok = 1;

    va_start(ap, parser);
    if (nargs != 0 || kwnames != NULL) {
        /* Something was passed: fall back to the general parser. */
        ok = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
    }
    va_end(ap);
    return ok;
}
/* Parse args for a function that takes one arg */
/* Parse arguments for a function that takes one argument. A single
   positional argument with no keywords is stored directly into the first
   output pointer; every other call shape goes through the general parser. */
int
CPyArg_ParseStackAndKeywordsOneArg(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list ap;
    int ok;

    va_start(ap, parser);
    if (kwnames != NULL || nargs != 1) {
        ok = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
    } else {
        /* Fast path: exactly one positional argument. */
        PyObject **dest = va_arg(ap, PyObject **);
        *dest = args[0];
        ok = 1;
    }
    va_end(ap);
    return ok;
}
/* Parse args for a function that takes no keyword-only args, *args or **kwargs */
/* Parse arguments for a function that takes no keyword-only args, *args or
   **kwargs. Once the parser has been initialized, a call with only
   positional arguments whose count lies within [parser->min, parser->max]
   is handled by copying the arguments straight into the output pointers. */
int
CPyArg_ParseStackAndKeywordsSimple(PyObject *const *args, Py_ssize_t nargs, PyObject *kwnames,
                                   CPyArg_Parser *parser, ...)
{
    va_list ap;
    int ok;

    va_start(ap, parser);
    if (kwnames == NULL && PARSER_INITED(parser) &&
        nargs >= parser->min && nargs <= parser->max) {
        /* Fast path: positional arguments only, and a valid number of them. */
        Py_ssize_t index = 0;
        while (index < nargs) {
            PyObject **dest = va_arg(ap, PyObject **);
            *dest = args[index];
            index++;
        }
        ok = 1;
    } else {
        ok = vgetargskeywordsfast_impl(args, nargs, NULL, kwnames, parser, &ap);
    }
    va_end(ap);
    return ok;
}
#define IS_END_OF_FORMAT(c) (c == '\0' || c == ';' || c == ':')
/* List of static parsers. */
static struct CPyArg_Parser *static_arg_parsers = NULL;
/* One-time initialization of a CPyArg_Parser.
 *
 * Scans parser->keywords and parser->format and fills in the derived fields:
 * pos (number of positional-only parameters), min / max (indexes of the '|'
 * and '$' markers, capped at the number of parameters), varargs (set when
 * the format starts with '%'), required_kwonly_start / has_required_kws
 * ('@' marker), fname or custom_msg (text after ':' or ';'), and kwtuple (a
 * tuple of interned names for the parameters that can be passed by keyword).
 * Finally the parser is linked into the static_arg_parsers list.
 *
 * Returns 1 on success, or immediately if the parser is already initialized;
 * returns 0 on failure. Format-string problems set SystemError.
 *
 * NOTE(review): parser->format is dereferenced by the '%' check before the
 * `if (format)` NULL test below, so a NULL format does not look supported
 * here -- confirm with callers.
 */
static int
parser_init(CPyArg_Parser *parser)
{
const char * const *keywords;
const char *format, *msg;
int i, len, min, max, nkw;
PyObject *kwtuple;
assert(parser->keywords != NULL);
if (PARSER_INITED(parser)) {
return 1;
}
keywords = parser->keywords;
/* scan keywords and count the number of positional-only parameters */
for (i = 0; keywords[i] && !*keywords[i]; i++) {
}
parser->pos = i;
/* scan keywords and get greatest possible nbr of args */
for (; keywords[i]; i++) {
if (!*keywords[i]) {
PyErr_SetString(PyExc_SystemError,
"Empty keyword parameter name");
return 0;
}
}
len = i;
parser->required_kwonly_start = INT_MAX;
/* A leading '%' marks a function accepting *args and/or **kwargs; it is
   consumed here by advancing the stored format pointer. */
if (*parser->format == '%') {
parser->format++;
parser->varargs = 1;
}
format = parser->format;
if (format) {
/* grab the function name or custom error msg first (mutually exclusive) */
parser->fname = strchr(parser->format, ':');
if (parser->fname) {
parser->fname++;
parser->custom_msg = NULL;
}
else {
parser->custom_msg = strchr(parser->format,';');
if (parser->custom_msg)
parser->custom_msg++;
}
/* INT_MAX means the corresponding marker has not been seen yet. */
min = max = INT_MAX;
for (i = 0; i < len; i++) {
if (*format == '|') {
if (min != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string (| specified twice)");
return 0;
}
if (max != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string ($ before |)");
return 0;
}
min = i;
format++;
}
if (*format == '$') {
if (max != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string ($ specified twice)");
return 0;
}
if (i < parser->pos) {
PyErr_SetString(PyExc_SystemError,
"Empty parameter name after $");
return 0;
}
max = i;
format++;
}
if (*format == '@') {
if (parser->required_kwonly_start != INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string (@ specified twice)");
return 0;
}
if (min == INT_MAX && max == INT_MAX) {
PyErr_SetString(PyExc_SystemError,
"Invalid format string "
"(@ without preceding | and $)");
return 0;
}
format++;
parser->has_required_kws = 1;
parser->required_kwonly_start = i;
}
if (IS_END_OF_FORMAT(*format)) {
PyErr_Format(PyExc_SystemError,
"More keyword list entries (%d) than "
"format specifiers (%d)", len, i);
return 0;
}
/* Advance past this parameter's format character; no va_list here. */
skipitem_fast(&format, NULL);
}
parser->min = Py_MIN(min, len);
parser->max = Py_MIN(max, len);
if (!IS_END_OF_FORMAT(*format) && (*format != '|') && (*format != '$')) {
PyErr_Format(PyExc_SystemError,
"more argument specifiers than keyword list entries "
"(remaining format:'%s')", format);
return 0;
}
}
/* Build the tuple of names for the parameters after the positional-only
   ones. */
nkw = len - parser->pos;
kwtuple = PyTuple_New(nkw);
if (kwtuple == NULL) {
return 0;
}
keywords = parser->keywords + parser->pos;
for (i = 0; i < nkw; i++) {
PyObject *str = PyUnicode_FromString(keywords[i]);
if (str == NULL) {
Py_DECREF(kwtuple);
return 0;
}
PyUnicode_InternInPlace(&str);
PyTuple_SET_ITEM(kwtuple, i, str);
}
/* Setting kwtuple is what makes PARSER_INITED() true from now on. */
parser->kwtuple = kwtuple;
assert(parser->next == NULL);
parser->next = static_arg_parsers;
static_arg_parsers = parser;
return 1;
}
/* Look up the value passed for keyword `key`.
 *
 * kwnames is the tuple of keyword names of the call and kwstack the array of
 * the corresponding values (same order). Returns the value pointer taken
 * directly from kwstack, or NULL if `key` was not passed.
 */
static PyObject*
find_keyword(PyObject *kwnames, PyObject *const *kwstack, PyObject *key)
{
    Py_ssize_t count = PyTuple_GET_SIZE(kwnames);
    Py_ssize_t idx;

    /* First pass: pointer identity. Keyword names should be interned
       strings, so this normally finds the match. */
    for (idx = 0; idx < count; idx++) {
        if (PyTuple_GET_ITEM(kwnames, idx) == key) {
            return kwstack[idx];
        }
    }

    /* Second pass: fall back to comparing string contents. */
    for (idx = 0; idx < count; idx++) {
        PyObject *candidate = PyTuple_GET_ITEM(kwnames, idx);
        assert(PyUnicode_Check(candidate));
        if (PyUnicode_Equal(candidate, key)) {
            return kwstack[idx];
        }
    }
    return NULL;
}
/* Core of the fast argument parser.
 *
 * Matches the positional arguments (args[0..nargs-1]) and the keyword
 * arguments (either the dict `kwargs`, or the names tuple `kwnames` whose
 * values follow the positional arguments in `args`) against the parameters
 * described by `parser`. For every parameter, one PyObject ** output slot is
 * consumed from *p_va; matched arguments are stored into their slot without
 * incrementing their reference count, and unmatched optional parameters leave
 * their slot untouched.
 *
 * If parser->varargs is set, the first two entries of *p_va are output slots
 * for the extra positional arguments (a new tuple) and the extra keyword
 * arguments (a new dict); either slot pointer may be NULL.
 *
 * Returns 1 on success and 0 on failure. The failure paths visible here set
 * a Python exception or propagate one raised by a callee.
 */
static int
vgetargskeywordsfast_impl(PyObject *const *args, Py_ssize_t nargs,
PyObject *kwargs, PyObject *kwnames,
CPyArg_Parser *parser,
va_list *p_va)
{
PyObject *kwtuple;
const char *format;
PyObject *keyword;
int i, pos, len;
Py_ssize_t nkwargs;
PyObject *current_arg;
PyObject *const *kwstack = NULL;
int bound_pos_args;
PyObject **p_args = NULL, **p_kwargs = NULL;
/* Keyword arguments come in at most one of the two supported forms. */
assert(kwargs == NULL || PyDict_Check(kwargs));
assert(kwargs == NULL || kwnames == NULL);
assert(p_va != NULL);
if (!parser_init(parser)) {
return 0;
}
kwtuple = parser->kwtuple;
pos = parser->pos;
/* Total number of declared parameters: those before index `pos` have no
   keyword name; the rest are named by the entries of kwtuple. */
len = pos + (int)PyTuple_GET_SIZE(kwtuple);
/* With varargs enabled, the *args / **kwargs output slots come first in
   the va_list, before the per-parameter slots. */
if (parser->varargs) {
p_args = va_arg(*p_va, PyObject **);
p_kwargs = va_arg(*p_va, PyObject **);
}
/* Count the keyword arguments, and for the kwnames form locate their values,
   which are stored right after the positional arguments. */
if (kwargs != NULL) {
nkwargs = PyDict_GET_SIZE(kwargs);
}
else if (kwnames != NULL) {
nkwargs = PyTuple_GET_SIZE(kwnames);
kwstack = args + nargs;
}
else {
nkwargs = 0;
}
/* Too many arguments overall. Only an error when there is neither an *args
   nor a **kwargs slot to absorb the surplus. */
if (nargs + nkwargs > len && !p_args && !p_kwargs) {
/* Adding "keyword" (when nargs == 0) prevents producing wrong error
messages in some special cases (see bpo-31229). */
PyErr_Format(PyExc_TypeError,
"%.200s%s takes at most %d %sargument%s (%zd given)",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
len,
(nargs == 0) ? "keyword " : "",
(len == 1) ? "" : "s",
nargs + nkwargs);
return 0;
}
/* Too many positional arguments and no *args slot to receive them. */
if (parser->max < nargs && !p_args) {
if (parser->max == 0) {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes no positional arguments",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()");
}
else {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes %s %d positional argument%s (%zd given)",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
(parser->min < parser->max) ? "at most" : "exactly",
parser->max,
parser->max == 1 ? "" : "s",
nargs);
}
return 0;
}
format = parser->format;
/* convert tuple args and keyword args in same loop, using kwtuple to drive process */
for (i = 0; i < len; i++) {
/* Step over the '|', '$' and '@' section markers in the format string;
   they do not correspond to a parameter. */
if (*format == '|') {
format++;
}
if (*format == '$') {
format++;
}
if (*format == '@') {
format++;
}
assert(!IS_END_OF_FORMAT(*format));
/* Prefer a positional argument for this parameter; otherwise, for named
   parameters, look the name up among the keyword arguments. */
if (i < nargs && i < parser->max) {
current_arg = args[i];
}
else if (nkwargs && i >= pos) {
keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
if (kwargs != NULL) {
current_arg = PyDict_GetItemWithError(kwargs, keyword);
if (!current_arg && PyErr_Occurred()) {
return 0;
}
}
else {
current_arg = find_keyword(kwnames, kwstack, keyword);
}
/* nkwargs counts the keyword arguments not yet matched to a parameter. */
if (current_arg) {
--nkwargs;
}
}
else {
current_arg = NULL;
}
/* Argument found: store it in this parameter's output slot. */
if (current_arg) {
PyObject **p = va_arg(*p_va, PyObject **);
*p = current_arg;
format++;
continue;
}
/* No argument for this parameter: an error if the parameter is required. */
if (i < parser->min || i >= parser->required_kwonly_start) {
/* Less arguments than required */
if (i < pos) {
Py_ssize_t min = Py_MIN(pos, parser->min);
PyErr_Format(PyExc_TypeError,
"%.200s%s takes %s %d positional argument%s"
" (%zd given)",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
min < parser->max ? "at least" : "exactly",
min,
min == 1 ? "" : "s",
nargs);
}
else {
keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
if (i >= parser->max) {
PyErr_Format(PyExc_TypeError, "%.200s%s missing required "
"keyword-only argument '%U'",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
keyword);
}
else {
PyErr_Format(PyExc_TypeError, "%.200s%s missing required "
"argument '%U' (pos %d)",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
keyword, i+1);
}
}
return 0;
}
/* current code reports success when all required args
* fulfilled and no keyword args left, with no further
* validation. XXX Maybe skip this in debug build ?
*/
if (!nkwargs && !parser->has_required_kws && !p_args && !p_kwargs) {
return 1;
}
/* We are into optional args, skip through to any remaining
* keyword args */
skipitem_fast(&format, p_va);
}
assert(IS_END_OF_FORMAT(*format) || (*format == '|') || (*format == '$'));
/* Number of positional arguments that were bound to declared parameters;
   any positional arguments beyond this go into *args. */
bound_pos_args = Py_MIN(nargs, Py_MIN(parser->max, len));
if (p_args) {
/* Build the *args tuple from the surplus positional arguments. The tuple
   holds its own references, hence the Py_INCREF. */
*p_args = PyTuple_New(nargs - bound_pos_args);
if (!*p_args) {
return 0;
}
for (i = bound_pos_args; i < nargs; i++) {
PyObject *arg = args[i];
Py_INCREF(arg);
PyTuple_SET_ITEM(*p_args, i - bound_pos_args, arg);
}
}
if (p_kwargs) {
/* This unfortunately needs to be special cased because if len is 0 then we
* never go through the main loop. */
if (nargs > 0 && len == 0 && !p_args) {
PyErr_Format(PyExc_TypeError,
"%.200s%s takes no positional arguments",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()");
return 0;
}
/* Start with an empty **kwargs dict; it is filled with the unmatched keyword
   arguments below. */
*p_kwargs = PyDict_New();
if (!*p_kwargs) {
goto latefail;
}
}
/* Keyword arguments remain that were not matched in the main loop. */
if (nkwargs > 0) {
Py_ssize_t j;
PyObject *value;
/* make sure there are no arguments given by name and position */
for (i = pos; i < bound_pos_args; i++) {
keyword = PyTuple_GET_ITEM(kwtuple, i - pos);
if (kwargs != NULL) {
current_arg = PyDict_GetItemWithError(kwargs, keyword);
if (!current_arg && PyErr_Occurred()) {
goto latefail;
}
}
else {
current_arg = find_keyword(kwnames, kwstack, keyword);
}
if (current_arg) {
/* arg present in tuple and in dict */
PyErr_Format(PyExc_TypeError,
"argument for %.200s%s given by name ('%U') "
"and position (%d)",
(parser->fname == NULL) ? "function" : parser->fname,
(parser->fname == NULL) ? "" : "()",
keyword, i+1);
goto latefail;
}
}
/* make sure there are no extraneous keyword arguments */
j = 0;
while (1) {
int match;
/* Fetch the next (keyword, value) pair from whichever form was passed. */
if (kwargs != NULL) {
if (!PyDict_Next(kwargs, &j, &keyword, &value))
break;
}
else {
if (j >= PyTuple_GET_SIZE(kwnames))
break;
keyword = PyTuple_GET_ITEM(kwnames, j);
value = kwstack[j];
j++;
}
/* match: 1 if keyword names a declared parameter, 0 if not, negative if the
   lookup itself failed. */
match = PySequence_Contains(kwtuple, keyword);
if (match <= 0) {
if (!match) {
/* Unknown keyword: an error unless a **kwargs dict can take it. */
if (!p_kwargs) {
PyErr_Format(PyExc_TypeError,
"'%S' is an invalid keyword "
"argument for %.200s%s",
keyword,
(parser->fname == NULL) ? "this function" : parser->fname,
(parser->fname == NULL) ? "" : "()");
goto latefail;
} else {
if (PyDict_SetItem(*p_kwargs, keyword, value) < 0) {
goto latefail;
}
}
} else {
goto latefail;
}
}
}
}
return 1;
/* Handle failures that have happened after we have tried to
* create *args and **kwargs, if they exist. */
latefail:
/* NOTE(review): the output slots are released but not reset to NULL here;
   callers presumably must not use them after a 0 return -- confirm. */
if (p_args) {
Py_XDECREF(*p_args);
}
if (p_kwargs) {
Py_XDECREF(*p_kwargs);
}
return 0;
}
/* Skip one format unit: advance *p_format by a single character and, when a
 * va_list is supplied, discard the matching PyObject ** output slot. */
static void
skipitem_fast(const char **p_format, va_list *p_va)
{
    if (p_va != NULL) {
        (void) va_arg(*p_va, PyObject **);
    }
    *p_format = *p_format + 1;
}

View File

@@ -0,0 +1,24 @@
#include <Python.h>
#include "CPy.h"
// Statically allocated dummy object and a PyObject pointer to it. Its type
// pointer is initialized to NULL here and assigned in CPy_Init().
struct ExcDummyStruct _CPy_ExcDummyStruct = { PyObject_HEAD_INIT(NULL) };
PyObject *_CPy_ExcDummy = (PyObject *)&_CPy_ExcDummyStruct;
// System-wide empty tuple constant (NULL until CPy_Init() has created it)
PyObject * __mypyc_empty_tuple__ = NULL;
// Load-time initialization of the runtime's global objects.
//
// The Windows dynamic linker is more restricted than the Linux/macOS ones and
// doesn't allow globals to be statically initialized with values that live in
// other dynamic libraries, so those values are filled in here instead.
void CPy_Init(void) {
    _CPy_ExcDummyStruct.ob_base.ob_type = &PyBaseObject_Type;

    // Create the system-wide empty tuple constant, unless it already exists.
    if (__mypyc_empty_tuple__ != NULL) {
        return;
    }
    __mypyc_empty_tuple__ = PyTuple_New(0);
    if (__mypyc_empty_tuple__ == NULL) {
        CPyError_OutOfMemory();
    }
}

View File

@@ -0,0 +1,647 @@
// Int primitive operations (tagged arbitrary-precision integers)
//
// These are registered in mypyc.primitives.int_ops.
#include <Python.h>
#include "CPy.h"
#ifdef _MSC_VER
#include <intrin.h>
#endif
// CPyLong_FromSsize_t: the function used to box a Py_ssize_t as a Python int.
#ifndef _WIN32
// On 64-bit Linux and macOS, ssize_t and long are both 64 bits, and
// PyLong_FromLong is faster than PyLong_FromSsize_t, so use the faster one
#define CPyLong_FromSsize_t PyLong_FromLong
#else
// On 64-bit Windows, ssize_t is 64 bits but long is 32 bits, so we
// can't use the above trick
#define CPyLong_FromSsize_t PyLong_FromSsize_t
#endif
// CPY_CLZ(x): count of leading zero bits in x; CPY_BITS: width in bits of the
// type CPY_CLZ operates on. Both are only defined for GCC/Clang; the 64-bit
// variants are selected on x86-64, AArch64, or when pointers are 8 bytes.
#if defined(__GNUC__) || defined(__clang__)
# if defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__) || (defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 8)
# define CPY_CLZ(x) __builtin_clzll((unsigned long long)(x))
# define CPY_BITS 64
# else
# define CPY_CLZ(x) __builtin_clz((unsigned int)(x))
# define CPY_BITS 32
# endif
#endif
// Convert a Py_ssize_t to a tagged int.
//
// Values that don't fit once shifted left by one bit are boxed as a Python
// int and returned as a tagged pointer; everything else is stored unboxed.
CPyTagged CPyTagged_FromSsize_t(Py_ssize_t value) {
    // We use a Python object if the value shifted left by 1 is too
    // large for Py_ssize_t
    if (unlikely(CPyTagged_TooBig(value))) {
        PyObject *object = PyLong_FromSsize_t(value);
        return ((CPyTagged)object) | CPY_INT_TAG;
    }
    // Shift in the unsigned domain: left-shifting a negative signed value is
    // undefined behavior in C, while the unsigned shift yields the same bits.
    return (CPyTagged)((size_t)value << 1);
}
CPyTagged CPyTagged_FromVoidPtr(void *ptr) {
if ((uintptr_t)ptr > PY_SSIZE_T_MAX) {
PyObject *object = PyLong_FromVoidPtr(ptr);
return ((CPyTagged)object) | CPY_INT_TAG;
} else {
return CPyTagged_FromSsize_t((Py_ssize_t)ptr);
}
}
// Convert an int64_t to a tagged int.
//
// Values that are too big for the unboxed representation are boxed as a
// Python int and returned as a tagged pointer.
CPyTagged CPyTagged_FromInt64(int64_t value) {
    if (unlikely(CPyTagged_TooBigInt64(value))) {
        PyObject *object = PyLong_FromLongLong(value);
        return ((CPyTagged)object) | CPY_INT_TAG;
    }
    // Shift in the unsigned domain: left-shifting a negative signed value is
    // undefined behavior in C, while the unsigned shift yields the same bits.
    return (CPyTagged)((uint64_t)value << 1);
}
// Return a new reference to a Python int with the value of x. For a boxed
// (long) tagged int, the existing object's reference count is incremented.
PyObject *CPyTagged_AsObject(CPyTagged x) {
    if (unlikely(CPyTagged_CheckLong(x))) {
        PyObject *boxed = CPyTagged_LongAsObject(x);
        Py_INCREF(boxed);
        return boxed;
    }
    PyObject *fresh = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
    if (fresh == NULL) {
        CPyError_OutOfMemory();
    }
    return fresh;
}
// Like CPyTagged_AsObject, but for a boxed (long) tagged int the existing
// object is returned without incrementing its reference count.
PyObject *CPyTagged_StealAsObject(CPyTagged x) {
    if (unlikely(CPyTagged_CheckLong(x))) {
        return CPyTagged_LongAsObject(x);
    }
    PyObject *fresh = CPyLong_FromSsize_t(CPyTagged_ShortAsSsize_t(x));
    if (fresh == NULL) {
        CPyError_OutOfMemory();
    }
    return fresh;
}
// Convert a tagged int to Py_ssize_t. Boxed values are converted with
// PyLong_AsSsize_t, whose result is returned as is.
Py_ssize_t CPyTagged_AsSsize_t(CPyTagged x) {
    if (unlikely(!CPyTagged_CheckShort(x))) {
        return PyLong_AsSsize_t(CPyTagged_LongAsObject(x));
    }
    return CPyTagged_ShortAsSsize_t(x);
}
// Increment the reference count of a tagged int; a no-op unless x is boxed.
CPy_NOINLINE
void CPyTagged_IncRef(CPyTagged x) {
    if (likely(!CPyTagged_CheckLong(x))) {
        return;
    }
    Py_INCREF(CPyTagged_LongAsObject(x));
}
// Decrement the reference count of a tagged int; a no-op unless x is boxed.
CPy_NOINLINE
void CPyTagged_DecRef(CPyTagged x) {
    if (likely(!CPyTagged_CheckLong(x))) {
        return;
    }
    Py_DECREF(CPyTagged_LongAsObject(x));
}
// Decrement the reference count of a tagged int using Py_XDECREF; a no-op
// unless x is boxed.
CPy_NOINLINE
void CPyTagged_XDecRef(CPyTagged x) {
    if (likely(!CPyTagged_CheckLong(x))) {
        return;
    }
    Py_XDECREF(CPyTagged_LongAsObject(x));
}
// Slow path of tagged int negation: goes through Python int objects, since
// the result may not fit in a short int. Any failure is reported as out of
// memory.
CPyTagged CPyTagged_Negate_(CPyTagged num) {
    PyObject *operand = CPyTagged_AsObject(num);
    PyObject *negated = PyNumber_Negative(operand);
    if (negated == NULL) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(operand);
    return CPyTagged_StealFromObject(negated);
}
// Slow path of tagged int addition: goes through Python int objects, since
// the result may not fit in a short int. Any failure is reported as out of
// memory.
CPyTagged CPyTagged_Add_(CPyTagged left, CPyTagged right) {
    PyObject *lhs = CPyTagged_AsObject(left);
    PyObject *rhs = CPyTagged_AsObject(right);
    PyObject *sum = PyNumber_Add(lhs, rhs);
    if (sum == NULL) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return CPyTagged_StealFromObject(sum);
}
// Slow path of tagged int subtraction: goes through Python int objects,
// since the result may not fit in a short int. Any failure is reported as
// out of memory.
CPyTagged CPyTagged_Subtract_(CPyTagged left, CPyTagged right) {
    PyObject *lhs = CPyTagged_AsObject(left);
    PyObject *rhs = CPyTagged_AsObject(right);
    PyObject *difference = PyNumber_Subtract(lhs, rhs);
    if (difference == NULL) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return CPyTagged_StealFromObject(difference);
}
// Slow path of tagged int multiplication: goes through Python int objects,
// since the result may not fit in a short int. Any failure is reported as
// out of memory.
CPyTagged CPyTagged_Multiply_(CPyTagged left, CPyTagged right) {
    PyObject *lhs = CPyTagged_AsObject(left);
    PyObject *rhs = CPyTagged_AsObject(right);
    PyObject *product = PyNumber_Multiply(lhs, rhs);
    if (product == NULL) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return CPyTagged_StealFromObject(product);
}
// Slow path of tagged int '//': goes through Python int objects. Returns
// CPY_INT_TAG when the operation raised, leaving the exception set.
CPyTagged CPyTagged_FloorDivide_(CPyTagged left, CPyTagged right) {
    PyObject *dividend = CPyTagged_AsObject(left);
    PyObject *divisor = CPyTagged_AsObject(right);
    PyObject *quotient = PyNumber_FloorDivide(dividend, divisor);
    Py_DECREF(dividend);
    Py_DECREF(divisor);
    // Don't treat failure as out of memory: it could be ZeroDivisionError.
    if (quotient == NULL) {
        return CPY_INT_TAG;
    }
    return CPyTagged_StealFromObject(quotient);
}
// Slow path of tagged int '%': goes through Python int objects. Returns
// CPY_INT_TAG when the operation raised, leaving the exception set.
CPyTagged CPyTagged_Remainder_(CPyTagged left, CPyTagged right) {
    PyObject *dividend = CPyTagged_AsObject(left);
    PyObject *divisor = CPyTagged_AsObject(right);
    PyObject *remainder = PyNumber_Remainder(dividend, divisor);
    Py_DECREF(dividend);
    Py_DECREF(divisor);
    // Don't treat failure as out of memory: it could be ZeroDivisionError.
    if (remainder == NULL) {
        return CPY_INT_TAG;
    }
    return CPyTagged_StealFromObject(remainder);
}
// Slow path of tagged int equality. Returns false right away when `right`
// is a short int; otherwise compares the two values as Python int objects.
// A failed comparison is reported as out of memory.
bool CPyTagged_IsEq_(CPyTagged left, CPyTagged right) {
    if (CPyTagged_CheckShort(right)) {
        return false;
    }
    PyObject *lhs = CPyTagged_AsObject(left);
    PyObject *rhs = CPyTagged_AsObject(right);
    int equal = PyObject_RichCompareBool(lhs, rhs, Py_EQ);
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    if (equal == -1) {
        CPyError_OutOfMemory();
    }
    return equal;
}
// Slow path of tagged int '<': compares the two values as Python int
// objects. A failed comparison is reported as out of memory.
bool CPyTagged_IsLt_(CPyTagged left, CPyTagged right) {
    PyObject *lhs = CPyTagged_AsObject(left);
    PyObject *rhs = CPyTagged_AsObject(right);
    int less = PyObject_RichCompareBool(lhs, rhs, Py_LT);
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    if (less == -1) {
        CPyError_OutOfMemory();
    }
    return less;
}
// Parse the str object `o` as an integer in the given base (a tagged int)
// and return the result of PyLong_FromUnicodeObject.
PyObject *CPyLong_FromStrWithBase(PyObject *o, CPyTagged base) {
    return PyLong_FromUnicodeObject(o, CPyTagged_AsSsize_t(base));
}
// Parse the str object `o` as a base-10 integer.
PyObject *CPyLong_FromStr(PyObject *o) {
    return CPyLong_FromStrWithBase(o, CPyTagged_FromSsize_t(10));
}
CPyTagged CPyTagged_FromFloat(double f) {
if (f < ((double)CPY_TAGGED_MAX + 1.0) && f > (CPY_TAGGED_MIN - 1.0)) {
return (Py_ssize_t)f << 1;
}
PyObject *o = PyLong_FromDouble(f);
if (o == NULL)
return CPY_INT_TAG;
return CPyTagged_StealFromObject(o);
}
// Return the result of str() applied to the Python bool matching b.
PyObject *CPyBool_Str(bool b) {
    PyObject *flag = b ? Py_True : Py_False;
    return PyObject_Str(flag);
}
// Apply the bitwise operator selected by op ('&', '|', anything else means
// '^') using the generic (slow) number API on Python int objects. Any
// failure is reported as out of memory.
static CPyTagged GenericBitwiseOp(CPyTagged a, CPyTagged b, char op) {
    PyObject *lhs = CPyTagged_AsObject(a);
    PyObject *rhs = CPyTagged_AsObject(b);
    PyObject *combined;
    switch (op) {
    case '&':
        combined = PyNumber_And(lhs, rhs);
        break;
    case '|':
        combined = PyNumber_Or(lhs, rhs);
        break;
    default:
        combined = PyNumber_Xor(lhs, rhs);
        break;
    }
    if (unlikely(combined == NULL)) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(lhs);
    Py_DECREF(rhs);
    return CPyTagged_StealFromObject(combined);
}
// Give access to the digits of a tagged int.
//
// For a boxed int, a pointer into the PyLong object's own digit array is
// returned. For a short int, the digits are written to buf (assumed to have
// room for them, which avoids a memory allocation) and buf is returned.
// *size receives the digit count, negated if the integer is negative.
static digit *GetIntDigits(CPyTagged n, Py_ssize_t *size, digit *buf) {
    if (!CPyTagged_CheckShort(n)) {
        PyLongObject *big = (PyLongObject *)CPyTagged_LongAsObject(n);
        *size = CPY_LONG_SIZE_SIGNED(big);
        return &CPY_LONG_DIGIT(big, 0);
    }

    Py_ssize_t magnitude = CPyTagged_ShortAsSsize_t(n);
    const bool negative = magnitude < 0;
    if (negative) {
        magnitude = -magnitude;
    }

    // Peel off up to three digits, least significant first.
    int ndigits = 1;
    buf[0] = magnitude & PyLong_MASK;
    if (magnitude > (Py_ssize_t)PyLong_MASK) {
        magnitude >>= PyLong_SHIFT;
        buf[1] = magnitude & PyLong_MASK;
        ndigits = 2;
        if (magnitude > (Py_ssize_t)PyLong_MASK) {
            buf[2] = magnitude >> PyLong_SHIFT;
            ndigits = 3;
        }
    }
    *size = negative ? -ndigits : ndigits;
    return buf;
}
// Bitwise '&', '|' or '^' (chosen by op; anything other than '&' and '|'
// means '^') where at least one operand is a long integer. Works on the
// digits directly for speed when both operands are non-negative, and defers
// to GenericBitwiseOp otherwise.
CPyTagged CPyTagged_BitwiseLongOp_(CPyTagged a, CPyTagged b, char op) {
    // Scratch space for the digits of a short operand.
    digit a_scratch[3];
    digit b_scratch[3];
    Py_ssize_t short_len;
    Py_ssize_t long_len;
    digit *short_digits = GetIntDigits(a, &short_len, a_scratch);
    digit *long_digits = GetIntDigits(b, &long_len, b_scratch);

    if (unlikely(short_len < 0 || long_len < 0)) {
        // A negative operand is rare for bitwise ops; take the slower route.
        return GenericBitwiseOp(a, b, op);
    }

    // From here on both operands are non-negative. Make sure the "short"
    // operand really is the one with no more digits than the other.
    if (short_len > long_len) {
        digit *swap_digits = short_digits;
        short_digits = long_digits;
        long_digits = swap_digits;
        Py_ssize_t swap_len = short_len;
        short_len = long_len;
        long_len = swap_len;
    }

    const bool is_and = (op == '&');
    void *raw = NULL;
    PyLongWriter *writer = PyLongWriter_Create(0, is_and ? short_len : long_len, &raw);
    if (unlikely(writer == NULL)) {
        CPyError_OutOfMemory();
    }
    digit *out = (digit *)raw;

    Py_ssize_t i = 0;
    if (is_and) {
        // Digits beyond the shorter operand are all zero in the result.
        for (; i < short_len; i++) {
            out[i] = short_digits[i] & long_digits[i];
        }
    } else {
        if (op == '|') {
            for (; i < short_len; i++) {
                out[i] = short_digits[i] | long_digits[i];
            }
        } else {
            for (; i < short_len; i++) {
                out[i] = short_digits[i] ^ long_digits[i];
            }
        }
        // The remaining high digits come unchanged from the longer operand.
        while (i < long_len) {
            out[i] = long_digits[i];
            i++;
        }
    }
    return CPyTagged_StealFromObject(PyLongWriter_Finish(writer));
}
// Slow path of bitwise '~': goes through a Python int object. Any failure
// is reported as out of memory.
CPyTagged CPyTagged_Invert_(CPyTagged num) {
    PyObject *operand = CPyTagged_AsObject(num);
    PyObject *inverted = PyNumber_Invert(operand);
    if (unlikely(inverted == NULL)) {
        CPyError_OutOfMemory();
    }
    Py_DECREF(operand);
    return CPyTagged_StealFromObject(inverted);
}
// Slow path of bitwise '>>', for a long integer operand or a negative shift
// count: uses the generic number API. Returns CPY_INT_TAG when the
// operation raised, leaving the exception set.
CPyTagged CPyTagged_Rshift_(CPyTagged left, CPyTagged right) {
    PyObject *value = CPyTagged_AsObject(left);
    PyObject *count = CPyTagged_AsObject(right);
    PyObject *shifted = PyNumber_Rshift(value, count);
    Py_DECREF(value);
    Py_DECREF(count);
    if (shifted != NULL) {
        return CPyTagged_StealFromObject(shifted);
    }
    // Pass the error on (it could be a negative shift count)
    return CPY_INT_TAG;
}
// Slow path of bitwise '<<', for a long integer operand or an out-of-range
// shift count: uses the generic number API. Returns CPY_INT_TAG when the
// operation raised, leaving the exception set.
CPyTagged CPyTagged_Lshift_(CPyTagged left, CPyTagged right) {
    PyObject *value = CPyTagged_AsObject(left);
    PyObject *count = CPyTagged_AsObject(right);
    PyObject *shifted = PyNumber_Lshift(value, count);
    Py_DECREF(value);
    Py_DECREF(count);
    if (shifted != NULL) {
        return CPyTagged_StealFromObject(shifted);
    }
    // Pass the error on (it could be a negative shift count)
    return CPY_INT_TAG;
}
// Slow path of unboxing a Python object to i64. Returns CPY_LL_INT_ERROR
// with an exception set if the conversion fails or the value overflows.
int64_t CPyLong_AsInt64_(PyObject *o) {
    int overflow;
    int64_t value = PyLong_AsLongLongAndOverflow(o, &overflow);
    if (value != -1) {
        return value;
    }
    // -1 is ambiguous: a conversion error, an overflow, or a genuine -1.
    if (PyErr_Occurred()) {
        return CPY_LL_INT_ERROR;
    }
    if (overflow) {
        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i64");
        return CPY_LL_INT_ERROR;
    }
    return value;
}
// i64 floor division with Python semantics. Raises ZeroDivisionError for a
// zero divisor and OverflowError for INT64_MIN // -1, returning
// CPY_LL_INT_ERROR in both cases.
int64_t CPyInt64_Divide(int64_t x, int64_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    if (x == INT64_MIN && y == -1) {
        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
        return CPY_LL_INT_ERROR;
    }
    int64_t quotient = x / y;
    // C division truncates toward zero while Python rounds down, so an
    // inexact quotient of operands with opposite signs is one too high.
    bool signs_differ = (x < 0) != (y < 0);
    if (signs_differ && quotient * y != x) {
        quotient -= 1;
    }
    return quotient;
}
// i64 remainder with Python semantics (a non-zero result has the sign of
// the divisor). Raises ZeroDivisionError and returns CPY_LL_INT_ERROR for a
// zero divisor.
int64_t CPyInt64_Remainder(int64_t x, int64_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    // Special-cased so the '%' below is never evaluated for INT64_MIN % -1,
    // which could crash.
    if (x == INT64_MIN && y == -1) {
        return 0;
    }
    int64_t rem = x % y;
    // C's remainder takes the sign of the dividend; shift it to the
    // divisor's sign when the operand signs differ.
    if (rem != 0 && ((x < 0) != (y < 0))) {
        rem += y;
    }
    return rem;
}
// Slow path of unboxing a Python object to i32. Returns CPY_LL_INT_ERROR
// with an exception set if the conversion fails or the value is out of
// range.
int32_t CPyLong_AsInt32_(PyObject *o) {
    int overflow;
    long value = PyLong_AsLongAndOverflow(o, &overflow);
    // A value that fits in a long may still be out of range for i32.
    if (value > 0x7fffffffLL || value < -0x80000000LL) {
        overflow = 1;
        value = -1;
    }
    if (value != -1) {
        return value;
    }
    // -1 is ambiguous: a conversion error, an overflow, or a genuine -1.
    if (PyErr_Occurred()) {
        return CPY_LL_INT_ERROR;
    }
    if (overflow) {
        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i32");
        return CPY_LL_INT_ERROR;
    }
    return value;
}
// i32 floor division with Python semantics. Raises ZeroDivisionError for a
// zero divisor and OverflowError for INT32_MIN // -1, returning
// CPY_LL_INT_ERROR in both cases.
int32_t CPyInt32_Divide(int32_t x, int32_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    if (x == INT32_MIN && y == -1) {
        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
        return CPY_LL_INT_ERROR;
    }
    int32_t quotient = x / y;
    // C division truncates toward zero while Python rounds down, so an
    // inexact quotient of operands with opposite signs is one too high.
    bool signs_differ = (x < 0) != (y < 0);
    if (signs_differ && quotient * y != x) {
        quotient -= 1;
    }
    return quotient;
}
// i32 remainder with Python semantics (a non-zero result has the sign of
// the divisor). Raises ZeroDivisionError and returns CPY_LL_INT_ERROR for a
// zero divisor.
int32_t CPyInt32_Remainder(int32_t x, int32_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    // Special-cased so the '%' below is never evaluated for INT32_MIN % -1,
    // which could crash.
    if (x == INT32_MIN && y == -1) {
        return 0;
    }
    int32_t rem = x % y;
    // C's remainder takes the sign of the dividend; shift it to the
    // divisor's sign when the operand signs differ.
    if (rem != 0 && ((x < 0) != (y < 0))) {
        rem += y;
    }
    return rem;
}
// Raise the OverflowError used when a value doesn't fit in i32.
void CPyInt32_Overflow() {
    const char *const message = "int too large to convert to i32";
    PyErr_SetString(PyExc_OverflowError, message);
}
// Slow path of unboxing a Python object to i16. Returns CPY_LL_INT_ERROR
// with an exception set if the conversion fails or the value is out of
// range.
int16_t CPyLong_AsInt16_(PyObject *o) {
    int overflow;
    long value = PyLong_AsLongAndOverflow(o, &overflow);
    // A value that fits in a long may still be out of range for i16.
    if (value > 0x7fff || value < -0x8000) {
        overflow = 1;
        value = -1;
    }
    if (value != -1) {
        return value;
    }
    // -1 is ambiguous: a conversion error, an overflow, or a genuine -1.
    if (PyErr_Occurred()) {
        return CPY_LL_INT_ERROR;
    }
    if (overflow) {
        PyErr_SetString(PyExc_OverflowError, "int too large to convert to i16");
        return CPY_LL_INT_ERROR;
    }
    return value;
}
// i16 floor division with Python semantics. Raises ZeroDivisionError for a
// zero divisor and OverflowError for INT16_MIN // -1, returning
// CPY_LL_INT_ERROR in both cases.
int16_t CPyInt16_Divide(int16_t x, int16_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    if (x == INT16_MIN && y == -1) {
        PyErr_SetString(PyExc_OverflowError, "integer division overflow");
        return CPY_LL_INT_ERROR;
    }
    int16_t quotient = x / y;
    // C division truncates toward zero while Python rounds down, so an
    // inexact quotient of operands with opposite signs is one too high.
    bool signs_differ = (x < 0) != (y < 0);
    if (signs_differ && quotient * y != x) {
        quotient--;
    }
    return quotient;
}
// i16 remainder with Python semantics (a non-zero result has the sign of
// the divisor). Raises ZeroDivisionError and returns CPY_LL_INT_ERROR for a
// zero divisor.
int16_t CPyInt16_Remainder(int16_t x, int16_t y) {
    if (y == 0) {
        PyErr_SetString(PyExc_ZeroDivisionError, "integer division or modulo by zero");
        return CPY_LL_INT_ERROR;
    }
    // INT16_MIN % -1 is answered directly, without evaluating '%'.
    if (x == INT16_MIN && y == -1) {
        return 0;
    }
    int16_t rem = x % y;
    // C's remainder takes the sign of the dividend; shift it to the
    // divisor's sign when the operand signs differ.
    if (rem != 0 && ((x < 0) != (y < 0))) {
        rem += y;
    }
    return rem;
}
// Raise the OverflowError used when a value doesn't fit in i16.
void CPyInt16_Overflow() {
    const char *const message = "int too large to convert to i16";
    PyErr_SetString(PyExc_OverflowError, message);
}
// Slow path of unboxing a Python object to u8. Returns CPY_LL_UINT_ERROR
// with an exception set if the conversion fails or the value is outside
// 0..255.
uint8_t CPyLong_AsUInt8_(PyObject *o) {
    int overflow;
    long value = PyLong_AsLongAndOverflow(o, &overflow);
    // Anything outside the u8 range (including a -1 error result) is
    // funnelled into the error handling below.
    if (value < 0 || value >= 256) {
        overflow = 1;
        value = -1;
    }
    if (value != -1) {
        return value;
    }
    if (PyErr_Occurred()) {
        return CPY_LL_UINT_ERROR;
    }
    if (overflow) {
        PyErr_SetString(PyExc_OverflowError, "int too large or small to convert to u8");
        return CPY_LL_UINT_ERROR;
    }
    return value;
}
// Raise the OverflowError used when a value doesn't fit in u8.
void CPyUInt8_Overflow() {
    const char *const message = "int too large or small to convert to u8";
    PyErr_SetString(PyExc_OverflowError, message);
}
// True division ('/') of two tagged ints, producing a double.
//
// Raises ZeroDivisionError and returns CPY_FLOAT_ERROR if y is zero. Two
// short ints are divided directly as doubles; if either operand is boxed,
// the division goes through PyNumber_TrueDivide, and CPY_FLOAT_ERROR is
// returned (with the exception left set) if that call fails.
double CPyTagged_TrueDivide(CPyTagged x, CPyTagged y) {
    if (unlikely(y == 0)) {
        PyErr_SetString(PyExc_ZeroDivisionError, "division by zero");
        return CPY_FLOAT_ERROR;
    }
    if (likely(!CPyTagged_CheckLong(x) && !CPyTagged_CheckLong(y))) {
        return (double)((Py_ssize_t)x >> 1) / (double)((Py_ssize_t)y >> 1);
    }

    PyObject *xo = CPyTagged_AsObject(x);
    PyObject *yo = CPyTagged_AsObject(y);
    PyObject *result = PyNumber_TrueDivide(xo, yo);
    // The operands are new references owned by this function; release them
    // on both the success and the failure path.
    Py_DECREF(xo);
    Py_DECREF(yo);
    if (result == NULL) {
        return CPY_FLOAT_ERROR;
    }
    double value = PyFloat_AsDouble(result);
    // The quotient object is also a new reference; release it once its
    // value has been extracted.
    Py_DECREF(result);
    return value;
}
// int.bit_length()
//
// Returns the number of bits needed to represent the absolute value of self,
// as a short tagged int (the count shifted left by one). Returns CPY_INT_TAG
// if the slow path reports an error.
CPyTagged CPyTagged_BitLength(CPyTagged self) {
// Handle zero
if (self == 0) {
return 0;
}
// Fast path for small (tagged) ints
if (CPyTagged_CheckShort(self)) {
Py_ssize_t val = CPyTagged_ShortAsSsize_t(self);
Py_ssize_t absval = val < 0 ? -val : val;
int bits = 0;
if (absval) {
// Find the index of the highest set bit with whatever the compiler offers:
// MSVC bit-scan intrinsics, the GCC/Clang count-leading-zeros builtin, or a
// plain shift loop.
#if defined(_MSC_VER)
#if defined(_WIN64)
unsigned long idx;
if (_BitScanReverse64(&idx, (unsigned __int64)absval)) {
bits = (int)(idx + 1);
}
#else
unsigned long idx;
if (_BitScanReverse(&idx, (unsigned long)absval)) {
bits = (int)(idx + 1);
}
#endif
#elif defined(__GNUC__) || defined(__clang__)
bits = (int)(CPY_BITS - CPY_CLZ(absval));
#else
// Fallback to loop if no builtin
while (absval) {
absval >>= 1;
bits++;
}
#endif
}
// Tag the result as a short int
return bits << 1;
}
// Slow path for big ints
PyObject *pyint = CPyTagged_AsObject(self);
// NOTE(review): the result is stored in an int, so a wider return value from
// _PyLong_NumBits would be truncated -- confirm against its declared type.
int bits = _PyLong_NumBits(pyint);
Py_DECREF(pyint);
if (bits < 0) {
// _PyLong_NumBits sets an error on failure
return CPY_INT_TAG;
}
return bits << 1;
}

View File

@@ -0,0 +1,311 @@
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdbool.h>
#include "librt_base64.h"
#include "libbase64.h"
#include "pythoncapi_compat.h"
#ifdef MYPYC_EXPERIMENTAL
// Forward declaration: the slow decoding path is defined after
// b64decode_internal, which calls it.
static PyObject *
b64decode_handle_invalid_input(
PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen);
// Largest input length (in bytes) that b64encode_internal accepts.
#define BASE64_MAXBIN ((PY_SSIZE_T_MAX - 3) / 2)
// Encoded outputs up to this many bytes are produced in a stack buffer
// instead of a heap allocation.
#define STACK_BUFFER_SIZE 1024
// Encode a bytes object using Base64 and return the result as a new bytes
// object.
//
// Returns NULL with an exception set if obj is not a bytes object
// (TypeError), if the input is longer than BASE64_MAXBIN (ValueError), or if
// a memory allocation fails.
static PyObject *
b64encode_internal(PyObject *obj) {
    if (!PyBytes_Check(obj)) {
        PyErr_SetString(PyExc_TypeError, "base64() expects a bytes object");
        return NULL;
    }

    char *bin_data = PyBytes_AS_STRING(obj);
    Py_ssize_t bin_len = PyBytes_GET_SIZE(obj);
    assert(bin_len >= 0);

    if (bin_len > BASE64_MAXBIN) {
        PyErr_SetString(PyExc_ValueError, "Too much data for base64 line");
        return NULL;
    }

    // Upper bound on the encoded size: four output bytes per (possibly
    // partial) group of three input bytes. Dividing before multiplying keeps
    // the computation from overflowing for inputs close to BASE64_MAXBIN.
    Py_ssize_t buflen = bin_len / 3 * 4 + 4;

    // Small outputs are encoded into a stack buffer to avoid an allocation.
    char stack_buf[STACK_BUFFER_SIZE];
    char *buf = stack_buf;
    if (buflen > STACK_BUFFER_SIZE) {
        buf = PyMem_Malloc(buflen);
        if (buf == NULL) {
            return PyErr_NoMemory();
        }
    }

    size_t actual_len;
    base64_encode(bin_data, bin_len, buf, &actual_len, 0);

    // Copies the encoded data; may be NULL on failure, which is passed on.
    PyObject *res = PyBytes_FromStringAndSize(buf, actual_len);
    if (buf != stack_buf) {
        PyMem_Free(buf);
    }
    return res;
}
// Module-level b64encode(): takes exactly one argument and passes it to
// b64encode_internal. Raises TypeError for any other argument count.
static PyObject*
b64encode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs == 1) {
        return b64encode_internal(args[0]);
    }
    PyErr_SetString(PyExc_TypeError, "b64encode() takes exactly one argument");
    return NULL;
}
// Return 1 if c belongs to the standard Base64 alphabet (A-Z, a-z, 0-9, '+'
// and '/'), or if c is '=' and allow_padding is true; otherwise return 0.
static inline int
is_valid_base64_char(char c, bool allow_padding) {
    if (c >= 'A' && c <= 'Z') {
        return 1;
    }
    if (c >= 'a' && c <= 'z') {
        return 1;
    }
    if (c >= '0' && c <= '9') {
        return 1;
    }
    if (c == '+' || c == '/') {
        return 1;
    }
    if (allow_padding && c == '=') {
        return 1;
    }
    return 0;
}
// Decode Base64 data given as a bytes object or an ASCII-only str and return
// the decoded data as a new bytes object.
//
// Returns NULL with an exception set on failure. Input that the fast decoder
// rejects is handed to b64decode_handle_invalid_input, which takes over the
// output object.
static PyObject *
b64decode_internal(PyObject *arg) {
    const char *src;
    Py_ssize_t input_len;

    // Locate the raw input characters and their count
    if (PyBytes_Check(arg)) {
        src = PyBytes_AS_STRING(arg);
        input_len = PyBytes_GET_SIZE(arg);
    } else if (PyUnicode_Check(arg)) {
        if (!PyUnicode_IS_ASCII(arg)) {
            PyErr_SetString(PyExc_ValueError,
                            "string argument should contain only ASCII characters");
            return NULL;
        }
        src = (const char *)PyUnicode_1BYTE_DATA(arg);
        input_len = PyUnicode_GET_LENGTH(arg);
    } else {
        PyErr_SetString(PyExc_TypeError,
                        "argument should be a bytes-like object or ASCII string");
        return NULL;
    }

    // Nothing to decode
    if (input_len == 0) {
        return PyBytes_FromStringAndSize(NULL, 0);
    }

    // Cheaply drop invalid characters at the end of the input. Invalid
    // characters elsewhere are accepted too, but only via the slow path.
    for (; input_len > 0; input_len--) {
        if (is_valid_base64_char(src[input_len - 1], true)) {
            break;
        }
    }

    // Output capacity of at least 3/4 of the input, computed without
    // overflow: ceil(3/4 * N) == N - floor(N/4)
    size_t srclen = (size_t)input_len;
    size_t capacity = srclen - (srclen / 4);
    if (capacity == 0) {
        capacity = 1;  // never ask for a zero-sized output buffer
    }
    if (capacity > (size_t)PY_SSIZE_T_MAX) {
        PyErr_SetString(PyExc_OverflowError, "input too large");
        return NULL;
    }

    // Uninitialized bytes object of the maximum capacity; decoded in place
    PyObject *out_bytes = PyBytes_FromStringAndSize(NULL, (Py_ssize_t)capacity);
    if (out_bytes == NULL) {
        return NULL;  // allocation failure, exception already set
    }
    char *outbuf = PyBytes_AS_STRING(out_bytes);
    size_t decoded_len = capacity;

    switch (base64_decode(src, srclen, outbuf, &decoded_len, 0)) {
    case 1:
        break;
    case 0:
        // Slow path: the input contains non-base64 characters
        return b64decode_handle_invalid_input(out_bytes, outbuf, capacity, src, srclen);
    case -1:
        Py_DECREF(out_bytes);
        PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
        return NULL;
    default:
        Py_DECREF(out_bytes);
        PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
        return NULL;
    }

    // The decoder must never report more output than the buffer holds
    if (decoded_len > capacity) {
        Py_DECREF(out_bytes);
        PyErr_SetString(PyExc_RuntimeError, "decoder wrote past output buffer");
        return NULL;
    }

    // Trim the bytes object down to the decoded length
    if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)decoded_len) < 0) {
        // _PyBytes_Resize sets an exception and may free the old object
        return NULL;
    }
    return out_bytes;
}
// Slow decoding path for input that contains non-base64 characters. Those
// characters are ignored, for compatibility with stdlib b64decode.
//
// out_bytes is the pre-allocated output object, outbuf its data and max_out
// its capacity; src/srclen is the original input. This function takes over
// out_bytes: it is either returned (shrunk to the decoded length) or
// released. Returns NULL with an exception set on failure.
static PyObject *
b64decode_handle_invalid_input(
    PyObject *out_bytes, char *outbuf, size_t max_out, const char *src, size_t srclen)
{
    // Copy input to a temporary buffer, with non-base64 characters and extra suffix
    // characters removed
    size_t newbuf_len = 0;
    char *newbuf = PyMem_Malloc(srclen);
    if (newbuf == NULL) {
        Py_DECREF(out_bytes);
        return PyErr_NoMemory();
    }

    // Copy base64 characters and some padding to the new buffer
    for (size_t i = 0; i < srclen; i++) {
        char c = src[i];
        if (is_valid_base64_char(c, false)) {
            newbuf[newbuf_len++] = c;
        } else if (c == '=') {
            // Copy a necessary amount of padding
            int remainder = newbuf_len % 4;
            if (remainder == 0) {
                // No padding needed
                break;
            }
            int numpad = 4 - remainder;
            // Check that there is at least the required amount padding (CPython ignores
            // extra padding)
            while (numpad > 0) {
                if (i == srclen || src[i] != '=') {
                    break;
                }
                newbuf[newbuf_len++] = '=';
                i++;
                numpad--;
                // Skip non-base64 alphabet characters within padding
                while (i < srclen && !is_valid_base64_char(src[i], true)) {
                    i++;
                }
            }
            // Anything after the padding is ignored
            break;
        }
    }

    // Stdlib always performs a non-strict padding check
    if (newbuf_len % 4 != 0) {
        Py_DECREF(out_bytes);
        PyMem_Free(newbuf);
        PyErr_SetString(PyExc_ValueError, "Incorrect padding");
        return NULL;
    }

    size_t outlen = max_out;
    int ret = base64_decode(newbuf, newbuf_len, outbuf, &outlen, 0);
    PyMem_Free(newbuf);

    if (ret != 1) {
        Py_DECREF(out_bytes);
        // Exactly one exception per failure code. These must be chained with
        // "else": otherwise the ValueError for ret == 0 is immediately
        // replaced by the RuntimeError of the final branch.
        if (ret == 0) {
            PyErr_SetString(PyExc_ValueError, "Only base64 data is allowed");
        } else if (ret == -1) {
            PyErr_SetString(PyExc_NotImplementedError, "base64 codec not available in this build");
        } else {
            PyErr_SetString(PyExc_RuntimeError, "base64_decode failed");
        }
        return NULL;
    }

    // Shrink in place to the actual decoded length
    if (_PyBytes_Resize(&out_bytes, (Py_ssize_t)outlen) < 0) {
        // _PyBytes_Resize sets an exception and may free the old object
        return NULL;
    }
    return out_bytes;
}
// Module-level b64decode(): takes exactly one argument and passes it to
// b64decode_internal. Raises TypeError for any other argument count.
static PyObject*
b64decode(PyObject *self, PyObject *const *args, size_t nargs) {
    if (nargs == 1) {
        return b64decode_internal(args[0]);
    }
    PyErr_SetString(PyExc_TypeError, "b64decode() takes exactly one argument");
    return NULL;
}
#endif
// Method table of the module. The functions are only registered when
// MYPYC_EXPERIMENTAL is defined; otherwise the table holds just the
// terminating sentinel entry.
static PyMethodDef librt_base64_module_methods[] = {
#ifdef MYPYC_EXPERIMENTAL
{"b64encode", (PyCFunction)b64encode, METH_FASTCALL, PyDoc_STR("Encode bytes object using Base64.")},
{"b64decode", (PyCFunction)b64decode, METH_FASTCALL, PyDoc_STR("Decode a Base64 encoded bytes object or ASCII string.")},
#endif
{NULL, NULL, 0, NULL}
};
// Report the ABI version of the C API exported by this module.
static int
base64_abi_version(void) {
    const int version = 0;
    return version;
}
// Report the API version of the C API exported by this module.
static int
base64_api_version(void) {
    const int version = 0;
    return version;
}
// Module exec slot. With MYPYC_EXPERIMENTAL defined, publishes the module's
// C API table as the capsule attribute "_C_API". Returns 0 on success and -1
// on failure.
static int
librt_base64_module_exec(PyObject *m)
{
#ifdef MYPYC_EXPERIMENTAL
    // Table of the exported internal C API functions. The order of the
    // entries matters, so be careful when changing it!
    static void *base64_api[LIBRT_BASE64_API_LEN] = {
        (void *)base64_abi_version,
        (void *)base64_api_version,
        (void *)b64encode_internal,
    };
    PyObject *capsule = PyCapsule_New((void *)base64_api, "librt.base64._C_API", NULL);
    int status = PyModule_Add(m, "_C_API", capsule);
    if (status < 0) {
        return -1;
    }
#endif
    return 0;
}
// Slots of the module definition: the exec function and, where
// Py_MOD_GIL_NOT_USED is available, the Py_mod_gil slot set to that value.
static PyModuleDef_Slot librt_base64_module_slots[] = {
{Py_mod_exec, librt_base64_module_exec},
#ifdef Py_MOD_GIL_NOT_USED
{Py_mod_gil, Py_MOD_GIL_NOT_USED},
#endif
{0, NULL}
};
// Module definition, tying together the method table and the slots above.
// m_size is 0: the definition requests no per-module state.
static PyModuleDef librt_base64_module = {
.m_base = PyModuleDef_HEAD_INIT,
.m_name = "base64",
.m_doc = "Fast base64 encoding and decoding optimized for mypyc",
.m_size = 0,
.m_methods = librt_base64_module_methods,
.m_slots = librt_base64_module_slots,
};
PyMODINIT_FUNC
PyInit_base64(void)
{
return PyModuleDef_Init(&librt_base64_module);
}

View File

@@ -0,0 +1,60 @@
#ifndef LIBRT_BASE64_H
#define LIBRT_BASE64_H
#ifndef MYPYC_EXPERIMENTAL
// Stub used when MYPYC_EXPERIMENTAL is not defined: every librt.base64
// feature is experimental for now, so there is no API to set up and the
// import trivially succeeds.
static int
import_librt_base64(void)
{
    const int success = 0;
    return success;
}
#else // MYPYC_EXPERIMENTAL
// Versions of the librt.base64 C API this header was written against;
// import_librt_base64() checks them against the values the module reports.
#define LIBRT_BASE64_ABI_VERSION 0
#define LIBRT_BASE64_API_VERSION 0
// Number of entries in the C API table
#define LIBRT_BASE64_API_LEN 3
// Local copy of the C API table, filled in by import_librt_base64()
static void *LibRTBase64_API[LIBRT_BASE64_API_LEN];
// Typed accessors for the individual table entries
#define LibRTBase64_ABIVersion (*(int (*)(void)) LibRTBase64_API[0])
#define LibRTBase64_APIVersion (*(int (*)(void)) LibRTBase64_API[1])
#define LibRTBase64_b64encode_internal (*(PyObject* (*)(PyObject *source)) LibRTBase64_API[2])
// Import librt.base64 and copy its C API table into LibRTBase64_API.
//
// Returns 0 on success. Returns -1 with an exception set if the module or
// its capsule can't be imported, if the ABI version differs from the
// expected one, or if the API version is older than the expected one.
static int
import_librt_base64(void)
{
    // The module object itself isn't needed; importing it is what makes the
    // capsule lookup below work.
    PyObject *module = PyImport_ImportModule("librt.base64");
    if (module == NULL) {
        return -1;
    }
    Py_DECREF(module);

    void *api_table = PyCapsule_Import("librt.base64._C_API", 0);
    if (api_table == NULL) {
        return -1;
    }
    memcpy(LibRTBase64_API, api_table, sizeof(LibRTBase64_API));

    char message[128];

    // The ABI version must match exactly
    if (LibRTBase64_ABIVersion() != LIBRT_BASE64_ABI_VERSION) {
        snprintf(message, sizeof(message),
                 "ABI version conflict for librt.base64, expected %d, found %d",
                 LIBRT_BASE64_ABI_VERSION,
                 LibRTBase64_ABIVersion());
        PyErr_SetString(PyExc_ValueError, message);
        return -1;
    }

    // The API version may be newer than expected, but not older
    if (LibRTBase64_APIVersion() < LIBRT_BASE64_API_VERSION) {
        snprintf(message, sizeof(message),
                 "API version conflict for librt.base64, expected %d or newer, found %d (hint: upgrade librt)",
                 LIBRT_BASE64_API_VERSION,
                 LibRTBase64_APIVersion());
        PyErr_SetString(PyExc_ValueError, message);
        return -1;
    }
    return 0;
}
#endif // MYPYC_EXPERIMENTAL
#endif // LIBRT_BASE64_H

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,111 @@
#ifndef LIBRT_INTERNAL_H
#define LIBRT_INTERNAL_H
// ABI version -- only an exact match is compatible. This will only be changed in
// very exceptional cases (likely never) due to strict backward compatibility
// requirements.
#define LIBRT_INTERNAL_ABI_VERSION 2
// API version -- more recent versions must maintain backward compatibility, i.e.
// we can add new features but not remove or change existing features (unless
// ABI version is changed, but see the comment above).
#define LIBRT_INTERNAL_API_VERSION 0
// Number of functions in the capsule API. If you add a new function, also increase
// LIBRT_INTERNAL_API_VERSION.
// This is also the size of the NativeInternal_API table declared below, whose
// accessor macros use indexes 0 through 19.
#define LIBRT_INTERNAL_API_LEN 20
#ifdef LIBRT_INTERNAL_MODULE
// Forward declarations used when LIBRT_INTERNAL_MODULE is defined, i.e. when
// these names refer to real functions rather than to capsule table entries.
// NOTE(review): ReadBuffer_internal is declared twice below; the repeat is
// redundant but harmless. ReadBuffer_internal_empty has no corresponding
// accessor macro in the client-side branch of this header.
static PyObject *ReadBuffer_internal(PyObject *source);
static PyObject *WriteBuffer_internal(void);
static PyObject *WriteBuffer_getvalue_internal(PyObject *self);
static PyObject *ReadBuffer_internal(PyObject *source);
static PyObject *ReadBuffer_internal_empty(void);
static char write_bool_internal(PyObject *data, char value);
static char read_bool_internal(PyObject *data);
static char write_str_internal(PyObject *data, PyObject *value);
static PyObject *read_str_internal(PyObject *data);
static char write_float_internal(PyObject *data, double value);
static double read_float_internal(PyObject *data);
static char write_int_internal(PyObject *data, CPyTagged value);
static CPyTagged read_int_internal(PyObject *data);
static char write_tag_internal(PyObject *data, uint8_t value);
static uint8_t read_tag_internal(PyObject *data);
static int NativeInternal_ABI_Version(void);
static char write_bytes_internal(PyObject *data, PyObject *value);
static PyObject *read_bytes_internal(PyObject *data);
static uint8_t cache_version_internal(void);
static PyTypeObject *ReadBuffer_type_internal(void);
static PyTypeObject *WriteBuffer_type_internal(void);
static int NativeInternal_API_Version(void);
#else
// Client side: local copy of the capsule's function pointer table (filled in
// by import_librt_internal) and one typed accessor macro per table entry.
// Each macro casts the untyped entry to the function's signature so that the
// name can be called like an ordinary function.
static void *NativeInternal_API[LIBRT_INTERNAL_API_LEN];
#define ReadBuffer_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[0])
#define WriteBuffer_internal (*(PyObject* (*)(void)) NativeInternal_API[1])
#define WriteBuffer_getvalue_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[2])
#define write_bool_internal (*(char (*)(PyObject *source, char value)) NativeInternal_API[3])
#define read_bool_internal (*(char (*)(PyObject *source)) NativeInternal_API[4])
#define write_str_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[5])
#define read_str_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[6])
#define write_float_internal (*(char (*)(PyObject *source, double value)) NativeInternal_API[7])
#define read_float_internal (*(double (*)(PyObject *source)) NativeInternal_API[8])
#define write_int_internal (*(char (*)(PyObject *source, CPyTagged value)) NativeInternal_API[9])
#define read_int_internal (*(CPyTagged (*)(PyObject *source)) NativeInternal_API[10])
#define write_tag_internal (*(char (*)(PyObject *source, uint8_t value)) NativeInternal_API[11])
#define read_tag_internal (*(uint8_t (*)(PyObject *source)) NativeInternal_API[12])
#define NativeInternal_ABI_Version (*(int (*)(void)) NativeInternal_API[13])
#define write_bytes_internal (*(char (*)(PyObject *source, PyObject *value)) NativeInternal_API[14])
#define read_bytes_internal (*(PyObject* (*)(PyObject *source)) NativeInternal_API[15])
#define cache_version_internal (*(uint8_t (*)(void)) NativeInternal_API[16])
#define ReadBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[17])
#define WriteBuffer_type_internal (*(PyTypeObject* (*)(void)) NativeInternal_API[18])
#define NativeInternal_API_Version (*(int (*)(void)) NativeInternal_API[19])
// Import librt.internal and copy its capsule API table into
// NativeInternal_API.
//
// Returns 0 on success. Returns -1 with a Python exception set when the
// module or its capsule cannot be imported, when the ABI version differs from
// LIBRT_INTERNAL_ABI_VERSION, or when the API version is older than
// LIBRT_INTERNAL_API_VERSION.
static int
import_librt_internal(void)
{
    // The module object itself is not needed; importing it is what makes the
    // capsule lookup below work.
    PyObject *module = PyImport_ImportModule("librt.internal");
    if (module == NULL) {
        return -1;
    }
    Py_DECREF(module);

    void *table = PyCapsule_Import("librt.internal._C_API", 0);
    if (table == NULL) {
        return -1;
    }
    memcpy(NativeInternal_API, table, sizeof(NativeInternal_API));

    char message[128];

    // The ABI version must match exactly.
    if (NativeInternal_ABI_Version() != LIBRT_INTERNAL_ABI_VERSION) {
        snprintf(message, sizeof(message),
                 "ABI version conflict for librt.internal, expected %d, found %d",
                 LIBRT_INTERNAL_ABI_VERSION,
                 NativeInternal_ABI_Version());
        PyErr_SetString(PyExc_ValueError, message);
        return -1;
    }

    // The API version of the imported module may be newer, but not older.
    if (NativeInternal_API_Version() < LIBRT_INTERNAL_API_VERSION) {
        snprintf(message, sizeof(message),
                 "API version conflict for librt.internal, expected %d or newer, found %d (hint: upgrade librt)",
                 LIBRT_INTERNAL_API_VERSION,
                 NativeInternal_API_Version());
        PyErr_SetString(PyExc_ValueError, message);
        return -1;
    }

    return 0;
}
#endif
// Return true if obj's type is exactly the type returned by
// ReadBuffer_type_internal() (pointer comparison, so subclasses do not match).
static inline bool CPyReadBuffer_Check(PyObject *obj) {
return Py_TYPE(obj) == ReadBuffer_type_internal();
}
// Return true if obj's type is exactly the type returned by
// WriteBuffer_type_internal() (pointer comparison, so subclasses do not match).
static inline bool CPyWriteBuffer_Check(PyObject *obj) {
return Py_TYPE(obj) == WriteBuffer_type_internal();
}
#endif // LIBRT_INTERNAL_H

View File

@@ -0,0 +1,406 @@
// List primitive operations
//
// These are registered in mypyc.primitives.list_ops.
#include <Python.h>
#include "CPy.h"
// Fallback definition for Python headers that do not provide
// Py_TPFLAGS_SEQUENCE; CPySequence_Check below tests this flag bit.
#ifndef Py_TPFLAGS_SEQUENCE
#define Py_TPFLAGS_SEQUENCE (1 << 5)
#endif
// Create a new list holding the `len` objects passed as variadic arguments.
//
// Each variadic argument is a PyObject * whose reference is stolen by the
// list. Returns the new list, or NULL if the list could not be allocated.
PyObject *CPyList_Build(Py_ssize_t len, ...) {
    PyObject *list = PyList_New(len);
    if (list == NULL) {
        return NULL;
    }

    va_list items;
    va_start(items, len);
    for (Py_ssize_t pos = 0; pos < len; pos++) {
        // The list takes over the caller's reference to the item.
        PyObject *item = va_arg(items, PyObject *);
        PyList_SET_ITEM(list, pos, item);
    }
    va_end(items);

    return list;
}
// Remove all items from a list (list.clear()).
//
// Exact list objects are cleared directly; for any other object its own
// "clear" method is looked up and called.
//
// Returns 1 on success and 0 (with a Python exception set) on failure.
char CPyList_Clear(PyObject *list) {
    if (PyList_CheckExact(list)) {
        if (PyList_Clear(list) < 0) {
            return 0;
        }
        return 1;
    }

    _Py_IDENTIFIER(clear);
    PyObject *name = _PyUnicode_FromId(&PyId_clear);
    if (name == NULL) {
        return 0;
    }
    PyObject *res = PyObject_CallMethodNoArgs(list, name);
    if (res == NULL) {
        return 0;
    }
    // The call hands back a new reference to the method's result; release it
    // so that it is not leaked.
    Py_DECREF(res);
    return 1;
}
// Return a shallow copy of a list (list.copy()).
//
// Exact list objects are copied with a full-range slice; for any other
// object its own "copy" method is called. Returns NULL on failure.
PyObject *CPyList_Copy(PyObject *list) {
    if (!PyList_CheckExact(list)) {
        _Py_IDENTIFIER(copy);
        PyObject *method_name = _PyUnicode_FromId(&PyId_copy);
        if (method_name == NULL) {
            return NULL;
        }
        return PyObject_CallMethodNoArgs(list, method_name);
    }
    return PyList_GetSlice(list, 0, PyList_GET_SIZE(list));
}
// Return a new reference to list[index], where index is a short tagged int.
//
// Negative indexes count from the end of the list. Sets IndexError and
// returns NULL when the index is out of range.
PyObject *CPyList_GetItemShort(PyObject *list, CPyTagged index) {
    Py_ssize_t size = PyList_GET_SIZE(list);
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    if (pos < 0) {
        pos += size;
    }
    if (pos < 0 || pos >= size) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    PyObject *item = PyList_GET_ITEM(list, pos);
    Py_INCREF(item);
    return item;
}
// Return a borrowed reference to list[index], where index is a short tagged
// int.
//
// Negative indexes count from the end of the list. Sets IndexError and
// returns NULL when the index is out of range.
PyObject *CPyList_GetItemShortBorrow(PyObject *list, CPyTagged index) {
    Py_ssize_t size = PyList_GET_SIZE(list);
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    if (pos < 0) {
        pos += size;
    }
    if (pos < 0 || pos >= size) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }
    return PyList_GET_ITEM(list, pos);
}
// Return a new reference to list[index] for an arbitrary tagged int index.
//
// Negative indexes count from the end of the list. Returns NULL with
// OverflowError set when the index is not a short tagged int, and with
// IndexError set when it is out of range.
PyObject *CPyList_GetItem(PyObject *list, CPyTagged index) {
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }

    Py_ssize_t size = PyList_GET_SIZE(list);
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    if (pos < 0) {
        pos += size;
    }
    if (pos < 0 || pos >= size) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }

    PyObject *item = PyList_GET_ITEM(list, pos);
    Py_INCREF(item);
    return item;
}
// Return a borrowed reference to list[index] for an arbitrary tagged int
// index.
//
// Negative indexes count from the end of the list. Returns NULL with
// OverflowError set when the index is not a short tagged int, and with
// IndexError set when it is out of range.
PyObject *CPyList_GetItemBorrow(PyObject *list, CPyTagged index) {
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }

    Py_ssize_t size = PyList_GET_SIZE(list);
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    if (pos < 0) {
        pos += size;
    }
    if (pos < 0 || pos >= size) {
        PyErr_SetString(PyExc_IndexError, "list index out of range");
        return NULL;
    }

    return PyList_GET_ITEM(list, pos);
}
// Return a new reference to list[index] for a 64-bit integer index.
//
// Negative indexes count from the end of the list. Sets IndexError and
// returns NULL when the index is out of range.
PyObject *CPyList_GetItemInt64(PyObject *list, int64_t index) {
    size_t size = PyList_GET_SIZE(list);
    // A single unsigned comparison accepts exactly the in-range non-negative
    // indexes; everything else is handled in the uncommon branch.
    if (unlikely((uint64_t)index >= size)) {
        if (index >= 0) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
        index += size;
        if (index < 0) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
    }
    PyObject *item = PyList_GET_ITEM(list, index);
    Py_INCREF(item);
    return item;
}
// Return a borrowed reference to list[index] for a 64-bit integer index.
//
// Negative indexes count from the end of the list. Sets IndexError and
// returns NULL when the index is out of range.
PyObject *CPyList_GetItemInt64Borrow(PyObject *list, int64_t index) {
    size_t size = PyList_GET_SIZE(list);
    // A single unsigned comparison accepts exactly the in-range non-negative
    // indexes; everything else is handled in the uncommon branch.
    if (unlikely((uint64_t)index >= size)) {
        if (index >= 0) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
        index += size;
        if (index < 0) {
            PyErr_SetString(PyExc_IndexError, "list index out of range");
            return NULL;
        }
    }
    return PyList_GET_ITEM(list, index);
}
// Assign list[index] = value for a tagged int index.
//
// Negative indexes count from the end of the list. On success the reference
// to `value` is stolen by the list and true is returned. Returns false with
// OverflowError set when the index is not a short tagged int, and with
// IndexError set when it is out of range.
bool CPyList_SetItem(PyObject *list, CPyTagged index, PyObject *value) {
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return false;
    }

    Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t size = PyList_GET_SIZE(list);
    if (n >= 0) {
        if (n >= size) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
    } else {
        n += size;
        if (n < 0) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
    }

    // PyList_SET_ITEM doesn't decref the old element, so we do. Store the new
    // value first: releasing the old element can run arbitrary code (e.g. a
    // destructor) that must not find a dangling pointer in the list.
    PyObject *old = PyList_GET_ITEM(list, n);
    // N.B: Steals reference
    PyList_SET_ITEM(list, n, value);
    Py_DECREF(old);
    return true;
}
// Assign list[index] = value for a 64-bit integer index.
//
// Negative indexes count from the end of the list. On success the reference
// to `value` is stolen by the list and true is returned. Sets IndexError and
// returns false when the index is out of range.
bool CPyList_SetItemInt64(PyObject *list, int64_t index, PyObject *value) {
    size_t size = PyList_GET_SIZE(list);
    if (unlikely((uint64_t)index >= size)) {
        // Any non-negative index that reaches this branch is out of range.
        // This includes index 0 on an empty list, which must be rejected here
        // rather than fall through to the slot access below.
        if (index >= 0) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
        index += size;
        if (index < 0) {
            PyErr_SetString(PyExc_IndexError, "list assignment index out of range");
            return false;
        }
    }

    // PyList_SET_ITEM doesn't decref the old element, so we do. Store the new
    // value first: releasing the old element can run arbitrary code (e.g. a
    // destructor) that must not find a dangling pointer in the list.
    PyObject *old = PyList_GET_ITEM(list, index);
    // N.B: Steals reference
    PyList_SET_ITEM(list, index, value);
    Py_DECREF(old);
    return true;
}
// This function should only be used to fill in brand new lists.
// It stores `value` at `index` with PyList_SET_ITEM and performs no bounds
// check here; any previous content of the slot is not released.
void CPyList_SetItemUnsafe(PyObject *list, Py_ssize_t index, PyObject *value) {
PyList_SET_ITEM(list, index, value);
}
#ifdef Py_GIL_DISABLED
// The original optimized list.pop implementation doesn't work on free-threaded
// builds, so provide an alternative that is a bit slower but works.
//
// Note that this implementation isn't intended to be atomic.
//
// Fetches a reference to list[index], then deletes that position from the
// list. Returns the fetched item, or NULL if either step fails (the fetched
// reference is released when the deletion fails).
static inline PyObject *list_pop_index(PyObject *list, Py_ssize_t index) {
PyObject *item = PyList_GetItemRef(list, index);
if (item == NULL) {
return NULL;
}
if (PySequence_DelItem(list, index) < 0) {
Py_DECREF(item);
return NULL;
}
return item;
}
#endif
// Remove and return the last item of a list (list.pop() without an index).
// Returns NULL on failure.
PyObject *CPyList_PopLast(PyObject *list)
{
#ifdef Py_GIL_DISABLED
// The other implementation causes segfaults on a free-threaded Python 3.14b4 build.
Py_ssize_t index = PyList_GET_SIZE(list) - 1;
return list_pop_index(list, index);
#else
// I tried a specialized version of pop_impl for just removing the
// last element and it wasn't any faster in microbenchmarks than
// the generic one so I ditched it.
return list_pop_impl((PyListObject *)list, -1);
#endif
}
// Remove and return the item at a tagged int index (list.pop(index)).
//
// Returns NULL with OverflowError set when the index is not a short tagged
// int; other failures are reported by the underlying pop helper.
PyObject *CPyList_Pop(PyObject *obj, CPyTagged index)
{
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }

    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
#ifdef Py_GIL_DISABLED
    // We must use a slower implementation on free-threaded builds.
    if (pos < 0) {
        pos += PyList_GET_SIZE(obj);
    }
    return list_pop_index(obj, pos);
#else
    return list_pop_impl((PyListObject *)obj, pos);
#endif
}
// Count the items of a list that compare equal to value (list.count()).
// Thin wrapper that casts obj to PyListObject * and delegates to list_count.
CPyTagged CPyList_Count(PyObject *obj, PyObject *value)
{
return list_count((PyListObject *)obj, value);
}
// Insert value before the given tagged int index (list.insert()).
//
// Returns the result of PyList_Insert for short indexes. Returns -1 with
// OverflowError set when the index is not a short tagged int.
int CPyList_Insert(PyObject *list, CPyTagged index, PyObject *value)
{
    if (!CPyTagged_CheckShort(index)) {
        // The max range doesn't exactly coincide with ssize_t, but we still
        // want to keep the error message compatible with CPython.
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    return PyList_Insert(list, pos, value);
}
// Extend list o1 with the items of o2 (list.extend()).
// Returns None on success and NULL if PyList_Extend reports a failure.
PyObject *CPyList_Extend(PyObject *o1, PyObject *o2) {
if (PyList_Extend(o1, o2) < 0) {
return NULL;
}
Py_RETURN_NONE;
}
// Return -2 on error, -1 if not found, or index of first match otherwise.
//
// Each item is kept alive with its own reference while it is being compared,
// since the comparison may run arbitrary code.
static Py_ssize_t _CPyList_Find(PyObject *list, PyObject *obj) {
    for (Py_ssize_t pos = 0; pos < Py_SIZE(list); pos++) {
        PyObject *candidate = PyList_GET_ITEM(list, pos);
        Py_INCREF(candidate);
        int outcome = PyObject_RichCompareBool(candidate, obj, Py_EQ);
        Py_DECREF(candidate);
        if (outcome > 0) {
            return pos;
        }
        if (outcome < 0) {
            return -2;
        }
    }
    return -1;
}
// Remove the first item equal to obj from the list (list.remove()).
//
// Returns the result of the slice deletion when a match is found. Returns -1
// when the search fails, or with ValueError set when there is no match.
int CPyList_Remove(PyObject *list, PyObject *obj) {
    Py_ssize_t pos = _CPyList_Find(list, obj);
    if (pos >= 0) {
        return PyList_SetSlice(list, pos, pos + 1, NULL);
    }
    if (pos == -1) {
        PyErr_SetString(PyExc_ValueError, "list.remove(x): x not in list");
    }
    // pos == -2: the comparison already reported the error.
    return -1;
}
// Return the position of the first item equal to obj as a tagged int
// (list.index()).
//
// Returns CPY_INT_TAG when the search fails, or with ValueError set when
// there is no match.
CPyTagged CPyList_Index(PyObject *list, PyObject *obj) {
    Py_ssize_t pos = _CPyList_Find(list, obj);
    if (pos >= 0) {
        return pos << 1;
    }
    if (pos == -1) {
        PyErr_SetString(PyExc_ValueError, "value is not in list");
    }
    // pos == -2: the comparison already reported the error.
    return CPY_INT_TAG;
}
// Return a new sorted list built from the items of seq.
//
// Returns NULL if the list cannot be created or if sorting fails.
PyObject *CPySequence_Sort(PyObject *seq) {
    PyObject *sorted = PySequence_List(seq);
    if (sorted == NULL) {
        return NULL;
    }
    if (PyList_Sort(sorted) < 0) {
        Py_DECREF(sorted);
        return NULL;
    }
    return sorted;
}
// Return seq repeated t_size times (seq * n).
//
// Returns NULL when the tagged int cannot be converted to Py_ssize_t or when
// the repeat operation fails.
PyObject *CPySequence_Multiply(PyObject *seq, CPyTagged t_size) {
    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
    // -1 is only an error marker when an exception is actually pending.
    if (count == -1 && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *repeated = PySequence_Repeat(seq, count);
    return repeated;
}
// Reflected form of CPySequence_Multiply (n * seq): same operation with the
// operands given in the opposite order.
PyObject *CPySequence_RMultiply(CPyTagged t_size, PyObject *seq) {
return CPySequence_Multiply(seq, t_size);
}
// Repeat seq in place t_size times (seq *= n).
//
// Returns NULL when the tagged int cannot be converted to Py_ssize_t or when
// the in-place repeat operation fails.
PyObject *CPySequence_InPlaceMultiply(PyObject *seq, CPyTagged t_size) {
    Py_ssize_t count = CPyTagged_AsSsize_t(t_size);
    // -1 is only an error marker when an exception is actually pending.
    if (count == -1 && PyErr_Occurred()) {
        return NULL;
    }
    PyObject *repeated = PySequence_InPlaceRepeat(seq, count);
    return repeated;
}
// Return obj[start:end].
//
// Exact lists with short tagged int bounds take a fast path in which
// negative bounds are shifted by the list length before slicing; everything
// else is delegated to CPyObject_GetSlice.
PyObject *CPyList_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (unlikely(!(PyList_CheckExact(obj)
                   && CPyTagged_CheckShort(start) && CPyTagged_CheckShort(end)))) {
        return CPyObject_GetSlice(obj, start, end);
    }

    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);
    if (lo < 0) {
        lo += PyList_GET_SIZE(obj);
    }
    if (hi < 0) {
        hi += PyList_GET_SIZE(obj);
    }
    return PyList_GetSlice(obj, lo, hi);
}
// Return non-zero if the type of obj has the Py_TPFLAGS_SEQUENCE flag set.
// The result is the masked flag bit itself, not a normalized 0/1 value.
int CPySequence_Check(PyObject *obj) {
return Py_TYPE(obj)->tp_flags & Py_TPFLAGS_SEQUENCE;
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,21 @@
#include <Python.h>
// Shim module entry point. NOTE(review): this file looks like a text template
// whose placeholders and doubled braces are expanded before compilation, so
// comments here must not contain brace characters -- confirm.
//
// Imports the library module, fetches the capsule stored under its
// init attribute for this module, and calls the function pointer it holds.
// Returns NULL if any step fails.
PyMODINIT_FUNC
PyInit_{modname}(void)
{{
PyObject *tmp;
if (!(tmp = PyImport_ImportModule("{libname}"))) return NULL;
PyObject *capsule = PyObject_GetAttrString(tmp, "init_{full_modname}");
// The library module reference is no longer needed once the attribute is fetched.
Py_DECREF(tmp);
if (capsule == NULL) return NULL;
void *init_func = PyCapsule_GetPointer(capsule, "{libname}.init_{full_modname}");
Py_DECREF(capsule);
if (!init_func) {{
return NULL;
}}
// The capsule payload is the real init function; its result is returned as is.
return ((PyObject *(*)(void))init_func)();
}}
// distutils sometimes spuriously tells cl to export CPyInit___init__,
// so provide that so it chills out
// (the extra entry point simply forwards to the real init function above)
PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}

View File

@@ -0,0 +1,41 @@
#include <Python.h>
// Exec slot of the shim module. NOTE(review): this file looks like a text
// template whose placeholders and doubled braces are expanded before
// compilation, so comments here must not contain brace characters -- confirm.
//
// Imports the library module, fetches the capsule stored under its
// exec attribute for this module, and calls the function pointer it holds
// with the module being initialized. Returns 0 on success and -1 on failure.
static int {modname}_exec(PyObject *module)
{{
PyObject *tmp;
if (!(tmp = PyImport_ImportModule("{libname}"))) return -1;
PyObject *capsule = PyObject_GetAttrString(tmp, "exec_{full_modname}");
// The library module reference is no longer needed once the attribute is fetched.
Py_DECREF(tmp);
if (capsule == NULL) return -1;
void *exec_func = PyCapsule_GetPointer(capsule, "{libname}.exec_{full_modname}");
Py_DECREF(capsule);
if (!exec_func) return -1;
// Any non-zero status from the real exec function is reported as -1.
if (((int (*)(PyObject *))exec_func)(module) != 0) return -1;
return 0;
}}
// Multi-phase initialization slots of the shim module: the exec function
// above, a declaration that multiple interpreters are not supported, and a
// declaration that the GIL is not used. The table ends with a zero slot.
static PyModuleDef_Slot {modname}_slots[] = {{
{{Py_mod_exec, {modname}_exec}},
{{Py_mod_multiple_interpreters, Py_MOD_MULTIPLE_INTERPRETERS_NOT_SUPPORTED}},
{{Py_mod_gil, Py_MOD_GIL_NOT_USED}},
{{0, NULL}},
}};
// Module definition of the shim module: no docstring, no methods, no
// per-module state (m_size is 0), and the slot table defined above.
static struct PyModuleDef {modname}_module = {{
PyModuleDef_HEAD_INIT,
.m_name = "{modname}",
.m_doc = NULL,
.m_methods = NULL,
.m_size = 0,
.m_slots = {modname}_slots,
}};
// Shim module entry point. Hands the module definition to PyModuleDef_Init;
// the exec slot runs later as part of multi-phase initialization.
PyMODINIT_FUNC
PyInit_{modname}(void)
{{
return PyModuleDef_Init(&{modname}_module);
}}
// distutils sometimes spuriously tells cl to export CPyInit___init__,
// so provide that so it chills out
// (the extra entry point simply forwards to the real init function above)
PyMODINIT_FUNC PyInit___init__(void) {{ return PyInit_{modname}(); }}

View File

@@ -0,0 +1,182 @@
#ifndef MYPYC_UTIL_H
#define MYPYC_UTIL_H
#include <Python.h>
#include <frameobject.h>
#include <assert.h>
// Branch prediction hints and an "unreachable code" marker. Clang and GCC
// use the compiler builtins; on other compilers the hints expand to the bare
// expression and CPy_Unreachable() calls abort().
#if defined(__clang__) || defined(__GNUC__)
#define likely(x) __builtin_expect((x),1)
#define unlikely(x) __builtin_expect((x),0)
#define CPy_Unreachable() __builtin_unreachable()
#else
#define likely(x) (x)
#define unlikely(x) (x)
#define CPy_Unreachable() abort()
#endif
// CPy_NOINLINE: function attribute asking the compiler not to inline.
// Expands to nothing on compilers other than Clang, GCC and MSVC.
#if defined(__clang__) || defined(__GNUC__)
#define CPy_NOINLINE __attribute__((noinline))
#elif defined(_MSC_VER)
#define CPy_NOINLINE __declspec(noinline)
#else
#define CPy_NOINLINE
#endif
// CPyThreadLocal: storage-class specifier for thread-local variables.
// It expands to nothing unless Py_GIL_DISABLED is defined; otherwise the
// first available mechanism from the list below is selected, and compilation
// fails if none is available.
#ifndef Py_GIL_DISABLED
// Everything is running in the same thread, so no need for thread locals
#define CPyThreadLocal
#else
// 1. Use C11 standard thread_local storage, if available
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L && !defined(__STDC_NO_THREADS__)
#define CPyThreadLocal _Thread_local
// 2. Microsoft Visual Studio fallback
#elif defined(_MSC_VER)
#define CPyThreadLocal __declspec(thread)
// 3. GNU thread local storage for GCC/Clang targets that still need it
#elif defined(__GNUC__) || defined(__clang__)
#define CPyThreadLocal __thread
#else
#error "Can't define CPyThreadLocal for this compiler/target (consider using a non-free-threaded Python build)"
#endif
#endif // Py_GIL_DISABLED
// INCREF and DECREF that assert the pointer is not NULL.
// asserts are disabled in release builds so there shouldn't be a perf hit.
// I'm honestly kind of surprised that this isn't done by default.
// The do/while(0) wrapper makes each macro behave as a single statement.
#define CPy_INCREF(p) do { assert(p); Py_INCREF(p); } while (0)
#define CPy_DECREF(p) do { assert(p); Py_DECREF(p); } while (0)
// Here just for consistency
#define CPy_XDECREF(p) Py_XDECREF(p)
#ifndef Py_GIL_DISABLED
// The *_NO_IMM operations below perform refcount manipulation for
// non-immortal objects (Python 3.12 and later).
//
// Py_INCREF and other CPython operations check for immortality. This
// can be expensive when we know that an object cannot be immortal.
//
// This optimization cannot be performed in free-threaded mode so we
// fall back to just calling the normal incref/decref operations.

// Increment the reference count directly, without an immortality check.
static inline void CPy_INCREF_NO_IMM(PyObject *op)
{
op->ob_refcnt++;
}
// Decrement the reference count directly and deallocate the object when the
// count reaches zero. op must not be NULL.
static inline void CPy_DECREF_NO_IMM(PyObject *op)
{
if (--op->ob_refcnt == 0) {
_Py_Dealloc(op);
}
}
// Same as CPy_DECREF_NO_IMM, but does nothing when op is NULL.
static inline void CPy_XDECREF_NO_IMM(PyObject *op)
{
if (op != NULL && --op->ob_refcnt == 0) {
_Py_Dealloc(op);
}
}
// Same-named macros that cast the argument to PyObject * so that callers can
// pass pointers to any object struct.
#define CPy_INCREF_NO_IMM(op) CPy_INCREF_NO_IMM((PyObject *)(op))
#define CPy_DECREF_NO_IMM(op) CPy_DECREF_NO_IMM((PyObject *)(op))
#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF_NO_IMM((PyObject *)(op))
#else
// Free-threaded builds: use the regular checked operations.
#define CPy_INCREF_NO_IMM(op) CPy_INCREF(op)
#define CPy_DECREF_NO_IMM(op) CPy_DECREF(op)
#define CPy_XDECREF_NO_IMM(op) CPy_XDECREF(op)
#endif
// Tagged integer -- our representation of Python 'int' objects.
// Small enough integers are represented as unboxed integers (shifted
// left by 1); larger integers (larger than 63 bits on a 64-bit
// platform) are stored as a tagged pointer (PyObject *)
// representing a Python int object, with the lowest bit set.
// Tagged integers are always normalized. A small integer *must not*
// have the tag bit set.
typedef size_t CPyTagged;
// NOTE(review): presumably an integer wide enough to hold a pointer -- confirm.
typedef size_t CPyPtr;
// Width of CPyTagged in bits.
#define CPY_INT_BITS (CHAR_BIT * sizeof(CPyTagged))
// Largest and smallest values representable as an unboxed (short) tagged int:
// 2**(bits-2) - 1 and -2**(bits-2).
#define CPY_TAGGED_MAX (((Py_ssize_t)1 << (CPY_INT_BITS - 2)) - 1)
#define CPY_TAGGED_MIN (-((Py_ssize_t)1 << (CPY_INT_BITS - 2)))
// Magnitude of CPY_TAGGED_MIN as an unsigned value.
#define CPY_TAGGED_ABS_MIN (0-(size_t)CPY_TAGGED_MIN)
typedef PyObject CPyModule;
// Tag bit used for long integers
#define CPY_INT_TAG 1
// Error value for signed fixed-width (low-level) integers
#define CPY_LL_INT_ERROR -113
// Error value for unsigned fixed-width (low-level) integers
#define CPY_LL_UINT_ERROR 239
// Error value for floats
#define CPY_FLOAT_ERROR -113.0
// Generic function pointer type. NOTE(review): the name suggests it is the
// element type of vtables -- confirm against users.
typedef void (*CPyVTableItem)(void);
static inline CPyTagged CPyTagged_ShortFromInt(int x) {
return x << 1;
}
// Convert a Py_ssize_t to a short (unboxed) tagged integer.
//
// The caller must pass a value within the short tagged range
// (CPY_TAGGED_MIN..CPY_TAGGED_MAX). The shift is performed on the unsigned
// CPyTagged type because left-shifting a negative signed value is undefined
// behaviour in C; the resulting bit pattern is the same.
static inline CPyTagged CPyTagged_ShortFromSsize_t(Py_ssize_t x) {
    return (CPyTagged)x << 1;
}
// Are we targeting Python 3.X or newer?
// Each macro is true when PY_VERSION_HEX is at least the hex version of the
// named release. (CPY_3_13_FEATURES is defined near the end of this header.)
#define CPY_3_11_FEATURES (PY_VERSION_HEX >= 0x030b0000)
#define CPY_3_12_FEATURES (PY_VERSION_HEX >= 0x030c0000)
#define CPY_3_14_FEATURES (PY_VERSION_HEX >= 0x030e0000)
// Accessors for the internals of PyLongObject. The argument `o` must be a
// PyLongObject pointer; the struct layout differs between Python versions, so
// two sets of definitions are provided.
#if CPY_3_12_FEATURES
// Same as macros in CPython internal/pycore_long.h, but with a CPY_ prefix
#define CPY_NON_SIZE_BITS 3
#define CPY_SIGN_ZERO 1
#define CPY_SIGN_NEGATIVE 2
#define CPY_SIGN_MASK 3
// Digit number n of the integer
#define CPY_LONG_DIGIT(o, n) ((o)->long_value.ob_digit[n])
// Only available on Python 3.12 and later
#define CPY_LONG_TAG(o) ((o)->long_value.lv_tag)
#define CPY_LONG_IS_NEGATIVE(o) (((o)->long_value.lv_tag & CPY_SIGN_MASK) == CPY_SIGN_NEGATIVE)
// Only available on Python 3.12 and later
#define CPY_LONG_SIZE(o) ((o)->long_value.lv_tag >> CPY_NON_SIZE_BITS)
// Number of digits; negative for negative ints
#define CPY_LONG_SIZE_SIGNED(o) (CPY_LONG_IS_NEGATIVE(o) ? -CPY_LONG_SIZE(o) : CPY_LONG_SIZE(o))
// Number of digits, assuming int is non-negative
#define CPY_LONG_SIZE_UNSIGNED(o) CPY_LONG_SIZE(o)
#else
// Older layout: the sign is carried by the sign of ob_size.
// Digit number n of the integer
#define CPY_LONG_DIGIT(o, n) ((o)->ob_digit[n])
#define CPY_LONG_IS_NEGATIVE(o) ((o)->ob_base.ob_size < 0)
// Number of digits; negative for negative ints
#define CPY_LONG_SIZE_SIGNED(o) ((o)->ob_base.ob_size)
// Number of digits, assuming int is non-negative
#define CPY_LONG_SIZE_UNSIGNED(o) ((o)->ob_base.ob_size)
#endif
// Are we targeting Python 3.13 or newer?
#define CPY_3_13_FEATURES (PY_VERSION_HEX >= 0x030d0000)
// Are we targeting Python 3.14 or newer?
// NOTE(review): CPY_3_14_FEATURES is already defined identically earlier in
// this header; this repeat is redundant but harmless.
#define CPY_3_14_FEATURES (PY_VERSION_HEX >= 0x030e0000)
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,106 @@
// Collects code that was copied in from cpython, for a couple of different reasons:
// * We wanted to modify it to produce a more efficient version for our uses
// * We needed to call it and it was static :(
// * We wanted to call it and needed to backport it
#include "pythonsupport.h"
#if CPY_3_12_FEATURES
// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined)
//
// Converts the Python int `vv` to a Py_ssize_t, accepting only values in the
// range CPY_TAGGED_MIN..CPY_TAGGED_MAX. `vv` is cast to PyLongObject without
// a type check.
//
// On success *overflow is set to 0 and the value is returned. If the value
// is out of range, *overflow is set to 1 or -1 (the sign of the value) and
// -1 is returned.
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
{
PyLongObject *v = (PyLongObject *)vv;
size_t x, prev;
Py_ssize_t res;
Py_ssize_t i;
int sign;
*overflow = 0;
res = -1;
// The tag word holds the sign in its low bits and the digit count above them.
i = CPY_LONG_TAG(v);
sign = 1;
x = 0;
if (i & CPY_SIGN_NEGATIVE) {
sign = -1;
}
// Drop the non-size bits; i is now the number of digits.
i >>= CPY_NON_SIZE_BITS;
// Accumulate the magnitude from the most significant digit down.
while (--i >= 0) {
prev = x;
x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
// If shifting back does not restore the previous value, bits were lost.
if ((x >> PyLong_SHIFT) != prev) {
*overflow = sign;
goto exit;
}
}
/* Haven't lost any bits, but casting to long requires extra
* care.
*/
if (x <= (size_t)CPY_TAGGED_MAX) {
res = (Py_ssize_t)x * sign;
}
// The most negative value has a magnitude one larger than CPY_TAGGED_MAX.
else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
res = CPY_TAGGED_MIN;
}
else {
*overflow = sign;
/* res is already set to -1 */
}
exit:
return res;
}
#else
// Slow path of CPyLong_AsSsize_tAndOverflow (non-inlined, Python 3.11 and earlier)
//
// Converts the Python int `vv` to a Py_ssize_t, accepting only values in the
// range CPY_TAGGED_MIN..CPY_TAGGED_MAX. `vv` is cast to PyLongObject without
// a type check.
//
// On success *overflow is set to 0 and the value is returned. If the value
// is out of range, *overflow is set to 1 or -1 (the sign of the value) and
// -1 is returned.
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow)
{
/* This version by Tim Peters */
PyLongObject *v = (PyLongObject *)vv;
size_t x, prev;
Py_ssize_t res;
Py_ssize_t i;
int sign;
*overflow = 0;
res = -1;
// Signed digit count: negative for negative integers.
i = Py_SIZE(v);
sign = 1;
x = 0;
if (i < 0) {
sign = -1;
i = -(i);
}
// Accumulate the magnitude from the most significant digit down.
while (--i >= 0) {
prev = x;
x = (x << PyLong_SHIFT) + CPY_LONG_DIGIT(v, i);
// If shifting back does not restore the previous value, bits were lost.
if ((x >> PyLong_SHIFT) != prev) {
*overflow = sign;
goto exit;
}
}
/* Haven't lost any bits, but casting to long requires extra
* care.
*/
if (x <= (size_t)CPY_TAGGED_MAX) {
res = (Py_ssize_t)x * sign;
}
// The most negative value has a magnitude one larger than CPY_TAGGED_MAX.
else if (sign < 0 && x == CPY_TAGGED_ABS_MIN) {
res = CPY_TAGGED_MIN;
}
else {
*overflow = sign;
/* res is already set to -1 */
}
exit:
return res;
}
#endif

View File

@@ -0,0 +1,478 @@
// Collects code that was copied in from cpython, for a couple of different reasons:
// * We wanted to modify it to produce a more efficient version for our uses
// * We needed to call it and it was static :(
// * We wanted to call it and needed to backport it
#ifndef CPY_PYTHONSUPPORT_H
#define CPY_PYTHONSUPPORT_H
#include <stdbool.h>
#include <Python.h>
#include "pythoncapi_compat.h"
#include <frameobject.h>
#include <assert.h>
#include "mypyc_util.h"
#if CPY_3_13_FEATURES
#ifndef Py_BUILD_CORE
#define Py_BUILD_CORE
#endif
#include "internal/pycore_genobject.h" // _PyGen_FetchStopIterationValue
#include "internal/pycore_pyerrors.h" // _PyErr_FormatFromCause, _PyErr_SetKeyError
#include "internal/pycore_setobject.h" // _PySet_Update
#endif
#if CPY_3_12_FEATURES
#include "internal/pycore_frame.h"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#if 0
} // why isn't emacs smart enough to not indent this
#endif
/////////////////////////////////////////
// Adapted from bltinmodule.c in Python 3.7.0
_Py_IDENTIFIER(__mro_entries__);

// Resolve __mro_entries__ for the entries of a bases tuple.
//
// Entries that are types are kept as they are. For any other entry that has
// an __mro_entries__ attribute, the attribute is called with the original
// bases tuple and the entry is replaced by the items of the returned tuple.
//
// Returns `bases` itself (no new reference is taken) when nothing was
// replaced, a new tuple when at least one entry was replaced, and NULL with
// an exception set on failure.
static PyObject*
update_bases(PyObject *bases)
{
    Py_ssize_t i, j;
    PyObject *base, *meth, *new_base, *result, *new_bases = NULL;
    PyObject *stack[1] = {bases};
    assert(PyTuple_Check(bases));

    Py_ssize_t nargs = PyTuple_GET_SIZE(bases);
    for (i = 0; i < nargs; i++) {
        base = PyTuple_GET_ITEM(bases, i);
        if (PyType_Check(base)) {
            if (new_bases) {
                /* If we already have made a replacement, then we append every normal base,
                   otherwise just skip it. */
                if (PyList_Append(new_bases, base) < 0) {
                    goto error;
                }
            }
            continue;
        }
        if (PyObject_GetOptionalAttrString(base, PyId___mro_entries__.string, &meth) < 0) {
            goto error;
        }
        if (!meth) {
            // No __mro_entries__: treat the entry like a normal base.
            if (new_bases) {
                if (PyList_Append(new_bases, base) < 0) {
                    goto error;
                }
            }
            continue;
        }
        new_base = PyObject_Vectorcall(meth, stack, 1, NULL);
        Py_DECREF(meth);
        if (!new_base) {
            goto error;
        }
        if (!PyTuple_Check(new_base)) {
            PyErr_SetString(PyExc_TypeError,
                            "__mro_entries__ must return a tuple");
            Py_DECREF(new_base);
            goto error;
        }
        if (!new_bases) {
            /* If this is a first successful replacement, create new_bases list and
               copy previously encountered bases. */
            if (!(new_bases = PyList_New(i))) {
                // new_base is still owned here; release it before bailing out.
                Py_DECREF(new_base);
                goto error;
            }
            for (j = 0; j < i; j++) {
                base = PyTuple_GET_ITEM(bases, j);
                PyList_SET_ITEM(new_bases, j, base);
                Py_INCREF(base);
            }
        }
        // Append all items of the replacement tuple at the end of the list.
        j = PyList_GET_SIZE(new_bases);
        if (PyList_SetSlice(new_bases, j, j, new_base) < 0) {
            // new_base is still owned here; release it before bailing out.
            Py_DECREF(new_base);
            goto error;
        }
        Py_DECREF(new_base);
    }
    if (!new_bases) {
        return bases;
    }
    result = PyList_AsTuple(new_bases);
    Py_DECREF(new_bases);
    return result;

error:
    Py_XDECREF(new_bases);
    return NULL;
}
// From Python 3.7's typeobject.c
_Py_IDENTIFIER(__init_subclass__);
// Call super(type, type).__init_subclass__(**kwds).
//
// Returns 0 on success and -1 if creating the super object, looking up the
// method, or the call itself fails.
static int
init_subclass(PyTypeObject *type, PyObject *kwds)
{
PyObject *super, *func, *result;
// Both arguments of super() are the type itself.
PyObject *args[2] = {(PyObject *)type, (PyObject *)type};
super = PyObject_Vectorcall((PyObject *)&PySuper_Type, args, 2, NULL);
if (super == NULL) {
return -1;
}
func = _PyObject_GetAttrId(super, &PyId___init_subclass__);
Py_DECREF(super);
if (func == NULL) {
return -1;
}
// No positional arguments; kwds supplies the keyword arguments.
result = _PyObject_FastCallDict(func, NULL, 0, kwds);
Py_DECREF(func);
if (result == NULL) {
return -1;
}
// The call's return value is not used.
Py_DECREF(result);
return 0;
}
// Non-inline slow path called by the inline CPyLong_AsSsize_tAndOverflow
// below whenever the integer does not fit in a single digit.
Py_ssize_t
CPyLong_AsSsize_tAndOverflow_(PyObject *vv, int *overflow);
#if CPY_3_12_FEATURES
// Convert the Python int `vv` to a Py_ssize_t (Python 3.12+ layout).
//
// Integers with zero digits or a single digit are handled inline; everything
// else is delegated to CPyLong_AsSsize_tAndOverflow_. *overflow is set to 0
// on the inline paths and to whatever the slow path reports otherwise.
// `vv` is cast to PyLongObject without a type check.
static inline Py_ssize_t
CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
{
/* This version by Tim Peters */
PyLongObject *v = (PyLongObject *)vv;
Py_ssize_t res;
Py_ssize_t i;
*overflow = 0;
res = -1;
i = CPY_LONG_TAG(v);
// TODO: Combine zero and non-zero cases below?
// Tag of a positive integer with exactly one digit.
if (likely(i == (1 << CPY_NON_SIZE_BITS))) {
res = CPY_LONG_DIGIT(v, 0);
} else if (likely(i == CPY_SIGN_ZERO)) {
res = 0;
// Tag of a negative integer with exactly one digit.
} else if (i == ((1 << CPY_NON_SIZE_BITS) | CPY_SIGN_NEGATIVE)) {
res = -(sdigit)CPY_LONG_DIGIT(v, 0);
} else {
// Slow path is moved to a non-inline helper function to
// limit size of generated code
int overflow_local;
res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
*overflow = overflow_local;
}
return res;
}
#else
// Adapted from longobject.c in Python 3.7.0
/* This function adapted from PyLong_AsLongLongAndOverflow, but with
* some safety checks removed and specialized to only work for objects
* that are already longs.
* About half of the win this provides, though, just comes from being
* able to inline the function, which in addition to saving function call
* overhead allows the out-parameter overflow flag to be collapsed into
* control flow.
* Additionally, we check against the possible range of CPyTagged, not of
* Py_ssize_t.
*
* Integers with a digit count of 0, 1 or -1 are handled inline with
* *overflow set to 0; all other values are delegated to the non-inline
* helper, which also determines *overflow. */
static inline Py_ssize_t
CPyLong_AsSsize_tAndOverflow(PyObject *vv, int *overflow)
{
/* This version by Tim Peters */
PyLongObject *v = (PyLongObject *)vv;
Py_ssize_t res;
Py_ssize_t i;
*overflow = 0;
res = -1;
// Signed digit count: negative for negative integers.
i = Py_SIZE(v);
if (likely(i == 1)) {
res = CPY_LONG_DIGIT(v, 0);
} else if (likely(i == 0)) {
res = 0;
} else if (i == -1) {
res = -(sdigit)CPY_LONG_DIGIT(v, 0);
} else {
// Slow path is moved to a non-inline helper function to
// limit size of generated code
int overflow_local;
res = CPyLong_AsSsize_tAndOverflow_(vv, &overflow_local);
*overflow = overflow_local;
}
return res;
}
#endif
// Adapted from listobject.c in Python 3.7.0
//
// Set the size of the list to `newsize`, reallocating the item array when the
// current allocation is too small or more than twice as large as needed.
// Only the size and the allocation are changed; item references are not
// touched here.
//
// Returns 0 on success and -1 with MemoryError set on failure.
static int
list_resize(PyListObject *self, Py_ssize_t newsize)
{
PyObject **items;
size_t new_allocated, num_allocated_bytes;
Py_ssize_t allocated = self->allocated;
/* Bypass realloc() when a previous overallocation is large enough
to accommodate the newsize. If the newsize falls lower than half
the allocated size, then proceed with the realloc() to shrink the list.
*/
if (allocated >= newsize && newsize >= (allocated >> 1)) {
assert(self->ob_item != NULL || newsize == 0);
Py_SET_SIZE(self, newsize);
return 0;
}
/* This over-allocates proportional to the list size, making room
* for additional growth. The over-allocation is mild, but is
* enough to give linear-time amortized behavior over a long
* sequence of appends() in the presence of a poorly-performing
* system realloc().
* The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
* Note: new_allocated won't overflow because the largest possible value
* is PY_SSIZE_T_MAX * (9 / 8) + 6 which always fits in a size_t.
*/
new_allocated = (size_t)newsize + (newsize >> 3) + (newsize < 9 ? 3 : 6);
// Refuse allocations whose byte size would exceed PY_SSIZE_T_MAX.
if (new_allocated > (size_t)PY_SSIZE_T_MAX / sizeof(PyObject *)) {
PyErr_NoMemory();
return -1;
}
// An empty list keeps no spare capacity.
if (newsize == 0)
new_allocated = 0;
num_allocated_bytes = new_allocated * sizeof(PyObject *);
items = (PyObject **)PyMem_Realloc(self->ob_item, num_allocated_bytes);
if (items == NULL) {
PyErr_NoMemory();
return -1;
}
self->ob_item = items;
Py_SET_SIZE(self, newsize);
self->allocated = new_allocated;
return 0;
}
// Changed to use PyList_SetSlice instead of the internal list_ass_slice
//
// Remove and return the item at `index` (negative indexes count from the
// end). The caller receives ownership of the returned reference.
//
// Returns NULL with IndexError set when the list is empty or the index is
// out of range, and NULL when shrinking the list or deleting the slice fails.
static PyObject *
list_pop_impl(PyListObject *self, Py_ssize_t index)
{
PyObject *v;
int status;
if (Py_SIZE(self) == 0) {
/* Special-case most common failure cause */
PyErr_SetString(PyExc_IndexError, "pop from empty list");
return NULL;
}
if (index < 0)
index += Py_SIZE(self);
if (index < 0 || index >= Py_SIZE(self)) {
PyErr_SetString(PyExc_IndexError, "pop index out of range");
return NULL;
}
v = self->ob_item[index];
// Popping the last item only needs the list to shrink by one.
if (index == Py_SIZE(self) - 1) {
status = list_resize(self, Py_SIZE(self) - 1);
if (status >= 0)
return v; /* and v now owns the reference the list had */
else
return NULL;
}
// General case: take our own reference before the slice deletion drops the
// list's reference to the item.
Py_INCREF(v);
status = PyList_SetSlice((PyObject *)self, index, index+1, (PyObject *)NULL);
if (status < 0) {
Py_DECREF(v);
return NULL;
}
return v;
}
// Tweaked to directly use CPyTagged
/*
 * Count the elements of `self` that compare equal to `value`.
 *
 * Returns the count as a short tagged int, or CPY_INT_TAG when a
 * comparison raises.
 */
static CPyTagged
list_count(PyListObject *self, PyObject *value)
{
    Py_ssize_t count = 0;
    Py_ssize_t i;
    /* Py_SIZE is re-read on every iteration because a comparison may run
       arbitrary code that changes the list. */
    for (i = 0; i < Py_SIZE(self); i++) {
        PyObject *item = self->ob_item[i];
        /* Hold a strong reference for the duration of the comparison: an
           __eq__ implementation could otherwise remove the element from the
           list and free it while it is still being compared. */
        Py_INCREF(item);
        int cmp = PyObject_RichCompareBool(item, value, Py_EQ);
        Py_DECREF(item);
        if (cmp > 0)
            count++;
        else if (cmp < 0)
            return CPY_INT_TAG;
    }
    return CPyTagged_ShortFromSsize_t(count);
}
// Adapted from genobject.c in Python 3.7.2
// Copied because it wasn't in 3.5.2 and it is undocumented anyway.
/*
* Set StopIteration with specified value. Value can be arbitrary object
* or NULL.
*
* Returns 0 if StopIteration is set and -1 if any other exception is set.
*/
static int
CPyGen_SetStopIterationValue(PyObject *value)
{
    /* Tuples and exception instances must be wrapped explicitly: handing
     * them straight to PyErr_SetObject would not store them unchanged as
     * the StopIteration value (a tuple would be unpacked as constructor
     * arguments). Everything else, including NULL, can be passed through
     * and instantiated lazily. */
    int must_wrap = value != NULL &&
                    (PyTuple_Check(value) || PyExceptionInstance_Check(value));

    if (!must_wrap) {
        PyErr_SetObject(PyExc_StopIteration, value);
        return 0;
    }

    /* Build StopIteration(value) by hand and raise that instance. */
    PyObject *exc = PyObject_CallOneArg(PyExc_StopIteration, value);
    if (exc == NULL) {
        return -1;
    }
    PyErr_SetObject(PyExc_StopIteration, exc);
    Py_DECREF(exc);
    return 0;
}
// Copied from dictobject.c and dictobject.h; these are not public before
// Python 3.8. Some error checks that the callers already perform were removed.
/* Object layout of the dict view instances created by _CPyDictView_New. */
typedef struct {
PyObject_HEAD
/* The dict this view refers to; _CPyDictView_New stores it after taking
   a new reference. */
PyDictObject *dv_dict;
} _CPyDictViewObject;
/*
 * Allocate a GC-tracked view object of the given `type` that refers to
 * `dict`. Returns a new reference, or NULL if the allocation fails.
 * No type checking is done on `dict` here.
 */
static PyObject *
_CPyDictView_New(PyObject *dict, PyTypeObject *type)
{
    _CPyDictViewObject *view = PyObject_GC_New(_CPyDictViewObject, type);
    if (view != NULL) {
        /* The view keeps its own reference to the dict. */
        Py_INCREF(dict);
        view->dv_dict = (PyDictObject *)dict;
        PyObject_GC_Track(view);
    }
    return (PyObject *)view;
}
#ifdef __cplusplus
}
#endif
#if PY_VERSION_HEX >= 0x030A0000 // 3.10
/*
 * Look up the attribute named by `name` on `v` and return the status from
 * PyObject_GetOptionalAttrString unchanged. Any attribute object that was
 * fetched is released again; only the status is of interest.
 */
static int
_CPyObject_HasAttrId(PyObject *v, _Py_Identifier *name) {
    PyObject *attr = NULL;
    int status = PyObject_GetOptionalAttrString(v, name->string, &attr);
    Py_XDECREF(attr);
    return status;
}
#else
#define _CPyObject_HasAttrId _PyObject_HasAttrId
#endif
#if CPY_3_12_FEATURES
// These are copied from genobject.c in Python 3.12
/*
 * Return 1 if `o` is exactly a generator object whose code object carries
 * the CO_ITERABLE_COROUTINE flag, and 0 otherwise.
 */
static int
gen_is_coroutine(PyObject *o)
{
    if (!PyGen_CheckExact(o)) {
        return 0;
    }
    /* PyGen_GetCode() returns a new strong reference, so it has to be
       released once the flags have been read; otherwise every call leaks
       one reference to the code object. */
    PyCodeObject *code = PyGen_GetCode((PyGenObject*)o);
    int is_coroutine = (code->co_flags & CO_ITERABLE_COROUTINE) ? 1 : 0;
    Py_DECREF(code);
    return is_coroutine;
}
#else
// Copied from genobject.c in Python 3.10
/*
 * Return 1 if `o` is exactly a generator object whose code object carries
 * the CO_ITERABLE_COROUTINE flag, and 0 otherwise.
 */
static int
gen_is_coroutine(PyObject *o)
{
    if (!PyGen_CheckExact(o)) {
        return 0;
    }
    /* gi_code is read directly from the generator struct. */
    PyCodeObject *code = (PyCodeObject *)((PyGenObject*)o)->gi_code;
    return (code->co_flags & CO_ITERABLE_COROUTINE) ? 1 : 0;
}
#endif
/*
* This helper function returns an awaitable for `o`:
* - `o` if `o` is a coroutine-object;
* - `type(o)->tp_as_async->am_await(o)`
*
* Raises a TypeError if it's not possible to return
* an awaitable and returns NULL.
*/
static PyObject *
CPyCoro_GetAwaitableIter(PyObject *o)
{
    /* Coroutines (and generators flagged as iterable coroutines) are
       returned as-is, with a new reference. */
    if (PyCoro_CheckExact(o) || gen_is_coroutine(o)) {
        Py_INCREF(o);
        return o;
    }

    PyTypeObject *type = Py_TYPE(o);
    unaryfunc am_await = NULL;
    if (type->tp_as_async != NULL) {
        am_await = type->tp_as_async->am_await;
    }
    if (am_await == NULL) {
        PyErr_Format(PyExc_TypeError,
                     "object %.100s can't be used in 'await' expression",
                     type->tp_name);
        return NULL;
    }

    PyObject *awaitable = (*am_await)(o);
    if (awaitable == NULL) {
        return NULL;
    }

    if (PyCoro_CheckExact(awaitable) || gen_is_coroutine(awaitable)) {
        /* __await__ must return an *iterator*, not
           a coroutine or another awaitable (see PEP 492) */
        PyErr_SetString(PyExc_TypeError,
                        "__await__() returned a coroutine");
        Py_DECREF(awaitable);
        return NULL;
    }
    if (!PyIter_Check(awaitable)) {
        PyErr_Format(PyExc_TypeError,
                     "__await__() returned non-iterator "
                     "of type '%.100s'",
                     Py_TYPE(awaitable)->tp_name);
        Py_DECREF(awaitable);
        return NULL;
    }
    return awaitable;
}
#endif

View File

@@ -0,0 +1,17 @@
// Set primitive operations
//
// These are registered in mypyc.primitives.set_ops.
#include <Python.h>
#include "CPy.h"
/*
 * Remove `key` from `set`. Returns true when the key was present and has
 * been removed. Returns false otherwise: a missing key sets KeyError, and
 * any other PySet_Discard result is returned as false without setting an
 * exception here.
 */
bool CPySet_Remove(PyObject *set, PyObject *key) {
    switch (PySet_Discard(set, key)) {
    case 1:
        return true;
    case 0:
        _PyErr_SetKeyError(key);
        return false;
    default:
        return false;
    }
}

View File

@@ -0,0 +1,623 @@
#include "pythoncapi_compat.h"
// String primitive operations
//
// These are registered in mypyc.primitives.str_ops.
#include <Python.h>
#include "CPy.h"
// The _PyUnicode_CheckConsistency definition has been moved to the internal API
// https://github.com/python/cpython/pull/106398
#if defined(Py_DEBUG) && defined(CPY_3_13_FEATURES)
#include "internal/pycore_unicodeobject.h"
#endif
// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// Integer type that holds the character bitmask built by make_bloom_mask().
#define BLOOM_MASK unsigned long
// Evaluates to nonzero when the bit selected by the low bits of `ch`
// (ch modulo BLOOM_WIDTH) is set in `mask`. A zero result means `ch` was
// not among the characters used to build the mask; a nonzero result still
// needs an exact check, since different characters can share a bit.
#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
// BLOOM_WIDTH is the number of mask bits in use: the largest of 128, 64 or
// 32 that does not exceed LONG_BIT.
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
// Copied from cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// This is needed for str.strip("...").
/*
 * Build a bitmask with one bit set for every character in the buffer
 * `ptr` of `len` code units of the given unicode `kind`. The bit index
 * for a character is its value modulo BLOOM_WIDTH.
 */
static inline BLOOM_MASK
make_bloom_mask(int kind, const void* ptr, Py_ssize_t len)
{
    BLOOM_MASK mask = 0;

    switch (kind) {
    case PyUnicode_1BYTE_KIND: {
        const Py_UCS1 *cur = (const Py_UCS1 *)ptr;
        const Py_UCS1 *stop = cur + len;
        for (; cur != stop; cur++) {
            Py_UCS4 ch = *cur;
            mask |= (1UL << (ch & (BLOOM_WIDTH - 1)));
        }
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *cur = (const Py_UCS2 *)ptr;
        const Py_UCS2 *stop = cur + len;
        for (; cur != stop; cur++) {
            Py_UCS4 ch = *cur;
            mask |= (1UL << (ch & (BLOOM_WIDTH - 1)));
        }
        break;
    }
    case PyUnicode_4BYTE_KIND: {
        const Py_UCS4 *cur = (const Py_UCS4 *)ptr;
        const Py_UCS4 *stop = cur + len;
        for (; cur != stop; cur++) {
            Py_UCS4 ch = *cur;
            mask |= (1UL << (ch & (BLOOM_WIDTH - 1)));
        }
        break;
    }
    default:
        Py_UNREACHABLE();
    }
    return mask;
}
// Shared tail of CPyStr_Equal and CPyStr_EqualLiteral: compares two str
// objects by length, kind and raw character data. The caller supplies the
// length of str2 and is expected to have handled the identical-pointer case.
static inline char _CPyStr_Equal_NoIdentCheck(PyObject *str1, PyObject *str2, Py_ssize_t str2_length) {
    Py_ssize_t length = PyUnicode_GET_LENGTH(str1);
    if (length != str2_length) {
        return 0;
    }

    int kind = PyUnicode_KIND(str1);
    if (kind != PyUnicode_KIND(str2)) {
        return 0;
    }

    // Same length and same kind: the buffers are equal iff their bytes are.
    Py_ssize_t nbytes = length * kind;
    return memcmp(PyUnicode_DATA(str1), PyUnicode_DATA(str2), nbytes) == 0;
}
// Adapted from CPython 3.13.1 (_PyUnicode_Equal)
// Return 1 if the two str objects are identical or have equal contents,
// 0 otherwise.
char CPyStr_Equal(PyObject *str1, PyObject *str2) {
    if (str1 != str2) {
        return _CPyStr_Equal_NoIdentCheck(str1, str2, PyUnicode_GET_LENGTH(str2));
    }
    return 1;
}
// Like CPyStr_Equal, but the caller passes the length of `literal_str`
// so it does not have to be read from the object.
char CPyStr_EqualLiteral(PyObject *str, PyObject *literal_str, Py_ssize_t literal_length) {
    if (str != literal_str) {
        return _CPyStr_Equal_NoIdentCheck(str, literal_str, literal_length);
    }
    return 1;
}
// Return str[index] as a new one-character str object.
//
// `index` is a tagged int; negative values index from the end. Returns NULL
// with IndexError set when the index is out of range, and NULL with
// OverflowError set when the index is not a short tagged int.
PyObject *CPyStr_GetItem(PyObject *str, CPyTagged index) {
    if (PyUnicode_READY(str) != -1) {
        if (CPyTagged_CheckShort(index)) {
            Py_ssize_t n = CPyTagged_ShortAsSsize_t(index);
            Py_ssize_t size = PyUnicode_GET_LENGTH(str);
            if (n < 0)
                n += size;
            if (n < 0 || n >= size) {
                PyErr_SetString(PyExc_IndexError, "string index out of range");
                return NULL;
            }
            enum PyUnicode_Kind kind = (enum PyUnicode_Kind)PyUnicode_KIND(str);
            void *data = PyUnicode_DATA(str);
            Py_UCS4 ch = PyUnicode_READ(kind, data, n);
            // Allocate a result just wide enough for `ch` and store it using
            // the width PyUnicode_New actually chose.
            PyObject *unicode = PyUnicode_New(1, ch);
            if (unicode == NULL)
                return NULL;
            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
            } else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
                PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
            } else {
                assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
                PyUnicode_4BYTE_DATA(unicode)[0] = ch;
            }
            return unicode;
        } else {
            PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
            return NULL;
        }
    } else {
        // Fall back to the generic protocol. CPyTagged_AsObject gives us a
        // reference that we own, so it must be released after the lookup;
        // the original code leaked it.
        PyObject *index_obj = CPyTagged_AsObject(index);
        PyObject *item = PyObject_GetItem(str, index_obj);
        Py_XDECREF(index_obj);
        return item;
    }
}
// Index `str` with a plain Py_ssize_t by converting it to a short tagged
// int and delegating to CPyStr_GetItem.
PyObject *CPyStr_GetItemUnsafe(PyObject *str, Py_ssize_t index) {
    // Unsafe: the shift is not checked for overflow.
    CPyTagged tagged_index = index << 1;
    return CPyStr_GetItem(str, tagged_index);
}
// A simplification of _PyUnicode_JoinArray() from CPython 3.9.6
// Concatenate `len` str objects, passed as variadic PyObject * arguments,
// into one new str.
//
// Returns a new reference, or NULL with an exception set: TypeError if an
// argument is not a str, OverflowError if the combined length does not fit
// in Py_ssize_t, or whatever PyUnicode_New raises.
PyObject *CPyStr_Build(Py_ssize_t len, ...) {
    Py_ssize_t i;
    va_list args;

    // First pass: calculate the total amount of space and check
    // whether all components have the same kind.
    Py_ssize_t sz = 0;
    Py_UCS4 maxchar = 0;
    int use_memcpy = 1; // Use memcpy by default
    PyObject *last_obj = NULL;

    va_start(args, len);
    for (i = 0; i < len; i++) {
        PyObject *item = va_arg(args, PyObject *);
        if (!PyUnicode_Check(item)) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected str instance,"
                         " %.80s found",
                         i, Py_TYPE(item)->tp_name);
            // Every va_start must be paired with va_end, also on error.
            va_end(args);
            return NULL;
        }
        if (PyUnicode_READY(item) == -1) {
            va_end(args);
            return NULL;
        }

        size_t add_sz = PyUnicode_GET_LENGTH(item);
        Py_UCS4 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
        maxchar = Py_MAX(maxchar, item_maxchar);

        // Using size_t to avoid overflow during arithmetic calculation
        if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long for a Python string");
            va_end(args);
            return NULL;
        }
        sz += add_sz;

        // If these strings have different kind, we would call
        // _PyUnicode_FastCopyCharacters() in the following part.
        if (use_memcpy && last_obj != NULL) {
            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
                use_memcpy = 0;
        }
        last_obj = item;
    }
    va_end(args);

    // Construct the string
    PyObject *res = PyUnicode_New(sz, maxchar);
    if (res == NULL)
        return NULL;

    if (use_memcpy) {
        // All items share one kind, so their raw buffers can be copied
        // back to back.
        unsigned char *res_data = PyUnicode_1BYTE_DATA(res);
        unsigned int kind = PyUnicode_KIND(res);

        va_start(args, len);
        for (i = 0; i < len; ++i) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
                memcpy(res_data, PyUnicode_DATA(item), kind * itemlen);
                res_data += kind * itemlen;
            }
        }
        va_end(args);
        assert(res_data == PyUnicode_1BYTE_DATA(res) + kind * PyUnicode_GET_LENGTH(res));
    } else {
        // Mixed kinds: copy character by character with widening.
        Py_ssize_t res_offset = 0;

        va_start(args, len);
        for (i = 0; i < len; ++i) {
            PyObject *item = va_arg(args, PyObject *);
            Py_ssize_t itemlen = PyUnicode_GET_LENGTH(item);
            if (itemlen != 0) {
#if CPY_3_13_FEATURES
                PyUnicode_CopyCharacters(res, res_offset, item, 0, itemlen);
#else
                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
#endif
                res_offset += itemlen;
            }
        }
        va_end(args);
        assert(res_offset == PyUnicode_GET_LENGTH(res));
    }

#ifdef Py_DEBUG
    assert(_PyUnicode_CheckConsistency(res, 1));
#endif
    return res;
}
// Search for `substr` in `str` from `start` up to the end of `str`;
// delegates to CPyStr_FindWithEnd with the string length as the end bound.
CPyTagged CPyStr_Find(PyObject *str, PyObject *substr, CPyTagged start, int direction) {
    Py_ssize_t length = PyUnicode_GET_LENGTH(str);
    // Tag the length as a short int (value << 1).
    CPyTagged tagged_end = length << 1;
    return CPyStr_FindWithEnd(str, substr, start, tagged_end, direction);
}
// Search for `substr` in str[start:end] via PyUnicode_Find, scanning in the
// given `direction`.
//
// Returns the match index as a short tagged int, tagged -1 when there is no
// match, or CPY_INT_TAG with an exception set on error (including a start
// or end value that does not fit in Py_ssize_t).
CPyTagged CPyStr_FindWithEnd(PyObject *str, PyObject *substr, CPyTagged start, CPyTagged end, int direction) {
    Py_ssize_t temp_start = CPyTagged_AsSsize_t(start);
    if (temp_start == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }
    Py_ssize_t temp_end = CPyTagged_AsSsize_t(end);
    if (temp_end == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return CPY_INT_TAG;
    }
    Py_ssize_t index = PyUnicode_Find(str, substr, temp_start, temp_end, direction);
    // PyUnicode_Find reports errors with -2; -1 only means "not found".
    if (unlikely(index == -2)) {
        return CPY_INT_TAG;
    }
    // Convert before shifting: `index` is -1 when nothing was found, and
    // left-shifting a negative signed value is undefined behaviour in C.
    return (CPyTagged)index << 1;
}
// str.split(sep, max_split) with a tagged `max_split`. Returns NULL with
// OverflowError set when `max_split` does not fit in Py_ssize_t.
PyObject *CPyStr_Split(PyObject *str, PyObject *sep, CPyTagged max_split) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
    // -1 is also a valid value, so an error is only signalled together
    // with a pending exception.
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_Split(str, sep, limit);
}
// str.rsplit(sep, max_split) with a tagged `max_split`. Returns NULL with
// OverflowError set when `max_split` does not fit in Py_ssize_t.
PyObject *CPyStr_RSplit(PyObject *str, PyObject *sep, CPyTagged max_split) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_split);
    // -1 is also a valid value, so an error is only signalled together
    // with a pending exception.
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_RSplit(str, sep, limit);
}
// This function has been copied from _PyUnicode_XStrip in cpython.git:Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// Strip characters contained in `sepobj` from the left and/or right end of
// `self`, depending on `striptype` (LEFTSTRIP / RIGHTSTRIP; any other value
// strips both ends). Returns the remaining substring as a new reference,
// or NULL on error.
static PyObject *_PyStr_XStrip(PyObject *self, int striptype, PyObject *sepobj) {
    // This check is needed for Python 3.9 and earlier.
    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
        return NULL;

    int kind = PyUnicode_KIND(self);
    const void *data = PyUnicode_DATA(self);
    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    Py_ssize_t seplen = PyUnicode_GET_LENGTH(sepobj);
    BLOOM_MASK sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
                                         PyUnicode_DATA(sepobj),
                                         seplen);

    // `left` is the index of the first character that is kept.
    Py_ssize_t left = 0;
    if (striptype != RIGHTSTRIP) {
        for (; left < len; left++) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, left);
            // The mask rules most characters out cheaply; only candidates
            // need the exact search in sepobj.
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
        }
    }

    // `right` is one past the index of the last character that is kept.
    Py_ssize_t right = len;
    if (striptype != LEFTSTRIP) {
        while (right > left) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
            if (!BLOOM(sepmask, ch))
                break;
            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
                break;
            right--;
        }
    }

    return PyUnicode_Substring(self, left, right);
}
// Copied from do_strip function in cpython.git/Objects/unicodeobject.c@0ef4ffeefd1737c18dc9326133c7894d58108c2e.
// Strip `self` on the side(s) selected by `strip_type`. With `sep` NULL or
// None, whitespace is stripped; otherwise the characters of `sep` are
// stripped via _PyStr_XStrip. Returns a new reference, or NULL on error.
PyObject *_CPyStr_Strip(PyObject *self, int strip_type, PyObject *sep) {
    if (sep != NULL && !Py_IsNone(sep)) {
        return _PyStr_XStrip(self, strip_type, sep);
    }

    // This check is needed for Python 3.9 and earlier.
    if (PyUnicode_READY(self) == -1)
        return NULL;

    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
    Py_ssize_t left = 0;     // first index that is kept
    Py_ssize_t right = len;  // one past the last index that is kept

    if (PyUnicode_IS_ASCII(self)) {
        // ASCII strings: look whitespace up directly in the byte table.
        const Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);

        if (strip_type != RIGHTSTRIP) {
            for (; left < len; left++) {
                Py_UCS1 ch = data[left];
                if (!_Py_ascii_whitespace[ch])
                    break;
            }
        }
        if (strip_type != LEFTSTRIP) {
            while (right > left) {
                Py_UCS1 ch = data[right - 1];
                if (!_Py_ascii_whitespace[ch])
                    break;
                right--;
            }
        }
    }
    else {
        int kind = PyUnicode_KIND(self);
        const void *data = PyUnicode_DATA(self);

        if (strip_type != RIGHTSTRIP) {
            for (; left < len; left++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, left);
                if (!Py_UNICODE_ISSPACE(ch))
                    break;
            }
        }
        if (strip_type != LEFTSTRIP) {
            while (right > left) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, right - 1);
                if (!Py_UNICODE_ISSPACE(ch))
                    break;
                right--;
            }
        }
    }

    return PyUnicode_Substring(self, left, right);
}
// str.replace(old_substr, new_substr, max_replace) with a tagged
// `max_replace`. Returns NULL with OverflowError set when `max_replace`
// does not fit in Py_ssize_t.
PyObject *CPyStr_Replace(PyObject *str, PyObject *old_substr,
                         PyObject *new_substr, CPyTagged max_replace) {
    Py_ssize_t limit = CPyTagged_AsSsize_t(max_replace);
    // -1 is also a valid value, so an error is only signalled together
    // with a pending exception.
    if (limit == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }
    return PyUnicode_Replace(str, old_substr, new_substr, limit);
}
// str.startswith(subobj), where `subobj` is a str or a tuple of str.
//
// Tuple argument: returns 1 if any element is a prefix of `self`, 0 if
// none is, and 2 with an exception set on error (a non-str element, or a
// failed comparison). Non-tuple argument: returns the result of
// PyUnicode_Tailmatch unchanged.
int CPyStr_Startswith(PyObject *self, PyObject *subobj) {
    Py_ssize_t start = 0;
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    if (PyTuple_Check(subobj)) {
        Py_ssize_t i;
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
            if (!PyUnicode_Check(substring)) {
                PyErr_Format(PyExc_TypeError,
                             "tuple for startswith must only contain str, "
                             "not %.100s",
                             Py_TYPE(substring)->tp_name);
                return 2;
            }
            int result = PyUnicode_Tailmatch(self, substring, start, end, -1);
            // PyUnicode_Tailmatch returns -1 on error; report it with the
            // same error value as above instead of treating it as a match.
            if (result < 0) {
                return 2;
            }
            if (result) {
                return 1;
            }
        }
        return 0;
    }
    return PyUnicode_Tailmatch(self, subobj, start, end, -1);
}
// str.endswith(subobj), where `subobj` is a str or a tuple of str.
//
// Tuple argument: returns 1 if any element is a suffix of `self`, 0 if
// none is, and 2 with an exception set on error (a non-str element, or a
// failed comparison). Non-tuple argument: returns the result of
// PyUnicode_Tailmatch unchanged.
int CPyStr_Endswith(PyObject *self, PyObject *subobj) {
    Py_ssize_t start = 0;
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    if (PyTuple_Check(subobj)) {
        Py_ssize_t i;
        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
            PyObject *substring = PyTuple_GET_ITEM(subobj, i);
            if (!PyUnicode_Check(substring)) {
                PyErr_Format(PyExc_TypeError,
                             "tuple for endswith must only contain str, "
                             "not %.100s",
                             Py_TYPE(substring)->tp_name);
                return 2;
            }
            int result = PyUnicode_Tailmatch(self, substring, start, end, 1);
            // PyUnicode_Tailmatch returns -1 on error; report it with the
            // same error value as above instead of treating it as a match.
            if (result < 0) {
                return 2;
            }
            if (result) {
                return 1;
            }
        }
        return 0;
    }
    return PyUnicode_Tailmatch(self, subobj, start, end, 1);
}
// str.removeprefix(prefix): returns `self` without the leading `prefix`
// when it starts with it, otherwise a new reference to `self` itself.
// Returns NULL with an exception set if the prefix comparison fails.
PyObject *CPyStr_Removeprefix(PyObject *self, PyObject *prefix) {
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    int match = PyUnicode_Tailmatch(self, prefix, 0, end, -1);
    // -1 signals an error, not a match; do not slice in that case.
    if (match < 0) {
        return NULL;
    }
    if (match) {
        Py_ssize_t prefix_end = PyUnicode_GET_LENGTH(prefix);
        return PyUnicode_Substring(self, prefix_end, end);
    }
    return Py_NewRef(self);
}
// str.removesuffix(suffix): returns `self` without the trailing `suffix`
// when it ends with it, otherwise a new reference to `self` itself.
// Returns NULL with an exception set if the suffix comparison fails.
PyObject *CPyStr_Removesuffix(PyObject *self, PyObject *suffix) {
    Py_ssize_t end = PyUnicode_GET_LENGTH(self);
    int match = PyUnicode_Tailmatch(self, suffix, 0, end, 1);
    // -1 signals an error, not a match; do not slice in that case.
    if (match < 0) {
        return NULL;
    }
    if (match) {
        Py_ssize_t suffix_end = PyUnicode_GET_LENGTH(suffix);
        return PyUnicode_Substring(self, 0, end - suffix_end);
    }
    return Py_NewRef(self);
}
/* This does a dodgy attempt to append in place */
// Append `o2` to `o1` through PyUnicode_Append and return whatever pointer
// that call leaves in place of `o1`.
PyObject *CPyStr_Append(PyObject *o1, PyObject *o2) {
    PyObject *result = o1;
    PyUnicode_Append(&result, o2);
    return result;
}
// Return obj[start:end]. Exact str objects with short tagged bounds take a
// fast path through PyUnicode_Substring; everything else is delegated to
// CPyObject_GetSlice.
PyObject *CPyStr_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (unlikely(!PyUnicode_CheckExact(obj)
                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
        return CPyObject_GetSlice(obj, start, end);
    }

    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);

    // Negative bounds count from the end and are clamped at 0.
    if (lo < 0) {
        lo += PyUnicode_GET_LENGTH(obj);
        if (lo < 0) {
            lo = 0;
        }
    }
    if (hi < 0) {
        hi += PyUnicode_GET_LENGTH(obj);
        if (hi < 0) {
            hi = 0;
        }
    }
    return PyUnicode_Substring(obj, lo, hi);
}
/* Check if the given string is true (i.e. its length isn't zero) */
/* Truth value of a str: true exactly when its length is non-zero. */
bool CPyStr_IsTrue(PyObject *obj) {
    return PyUnicode_GET_LENGTH(obj) != 0;
}
// Return the length of `str`, or -1 if PyUnicode_READY fails.
Py_ssize_t CPyStr_Size_size_t(PyObject *str) {
    if (PyUnicode_READY(str) == -1) {
        return -1;
    }
    return PyUnicode_GET_LENGTH(str);
}
// Decode `obj` to a str using the optional `encoding` and `errors` str
// objects (NULL leaves the corresponding C argument NULL). Returns NULL
// when either argument cannot be converted to UTF-8 or decoding fails.
PyObject *CPy_Decode(PyObject *obj, PyObject *encoding, PyObject *errors) {
    const char *enc = NULL;
    if (encoding != NULL) {
        enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
        if (enc == NULL) {
            return NULL;
        }
    }

    const char *err = NULL;
    if (errors != NULL) {
        err = PyUnicode_AsUTF8AndSize(errors, NULL);
        if (err == NULL) {
            return NULL;
        }
    }

    if (!PyBytes_Check(obj)) {
        return PyUnicode_FromEncodedObject(obj, enc, err);
    }

    // bytes objects: decode straight from the object's internal buffer.
    const char *buffer = ((PyBytesObject *)obj)->ob_sval;
    Py_ssize_t size = ((PyVarObject *)obj)->ob_size;
    return PyUnicode_Decode(buffer, size, enc, err);
}
// Decode `bytes` as UTF-8 with "strict" error handling. Exact bytes
// objects are decoded from their buffer; other objects go through
// PyUnicode_FromEncodedObject.
PyObject *CPy_DecodeUTF8(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "utf-8", "strict");
    }

    // The buffer belongs to `bytes`; it is not copied or owned here.
    char *data = PyBytes_AsString(bytes);
    if (data == NULL) {
        return NULL;
    }
    Py_ssize_t length = PyBytes_Size(bytes);
    return PyUnicode_DecodeUTF8(data, length, "strict");
}
// Decode `bytes` as ASCII with "strict" error handling. Exact bytes
// objects are decoded from their buffer; other objects go through
// PyUnicode_FromEncodedObject.
PyObject *CPy_DecodeASCII(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "ascii", "strict");
    }

    // The buffer belongs to `bytes`; it is not copied or owned here.
    char *data = PyBytes_AsString(bytes);
    if (data == NULL) {
        return NULL;
    }
    Py_ssize_t length = PyBytes_Size(bytes);
    return PyUnicode_DecodeASCII(data, length, "strict");
}
// Decode `bytes` as Latin-1 with "strict" error handling. Exact bytes
// objects are decoded from their buffer; other objects go through
// PyUnicode_FromEncodedObject.
PyObject *CPy_DecodeLatin1(PyObject *bytes) {
    if (!PyBytes_CheckExact(bytes)) {
        return PyUnicode_FromEncodedObject(bytes, "latin1", "strict");
    }

    // The buffer belongs to `bytes`; it is not copied or owned here.
    char *data = PyBytes_AsString(bytes);
    if (data == NULL) {
        return NULL;
    }
    Py_ssize_t length = PyBytes_Size(bytes);
    return PyUnicode_DecodeLatin1(data, length, "strict");
}
// Encode the str `obj` using the optional `encoding` and `errors` str
// objects (NULL leaves the corresponding C argument NULL). Returns NULL
// when either argument cannot be converted to UTF-8, when `obj` is not a
// str (via PyErr_BadArgument), or when encoding fails.
PyObject *CPy_Encode(PyObject *obj, PyObject *encoding, PyObject *errors) {
    const char *enc = NULL;
    if (encoding != NULL) {
        enc = PyUnicode_AsUTF8AndSize(encoding, NULL);
        if (enc == NULL) {
            return NULL;
        }
    }

    const char *err = NULL;
    if (errors != NULL) {
        err = PyUnicode_AsUTF8AndSize(errors, NULL);
        if (err == NULL) {
            return NULL;
        }
    }

    if (!PyUnicode_Check(obj)) {
        PyErr_BadArgument();
        return NULL;
    }
    return PyUnicode_AsEncodedString(obj, enc, err);
}
// Count occurrences of `substring` in unicode[start:], using
// PyUnicode_Count. Returns -1 with OverflowError set when `start` does not
// fit in Py_ssize_t; otherwise returns PyUnicode_Count's result.
Py_ssize_t CPyStr_Count(PyObject *unicode, PyObject *substring, CPyTagged start) {
    Py_ssize_t begin = CPyTagged_AsSsize_t(start);
    // -1 is also a valid value, so an error is only signalled together
    // with a pending exception.
    if (begin == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }
    return PyUnicode_Count(unicode, substring, begin, PyUnicode_GET_LENGTH(unicode));
}
// Count occurrences of `substring` in unicode[start:end], using
// PyUnicode_Count. Returns -1 with OverflowError set when `start` or `end`
// does not fit in Py_ssize_t; otherwise returns PyUnicode_Count's result.
Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged start, CPyTagged end) {
    Py_ssize_t begin = CPyTagged_AsSsize_t(start);
    if (begin == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }

    Py_ssize_t finish = CPyTagged_AsSsize_t(end);
    if (finish == -1 && PyErr_Occurred()) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return -1;
    }

    return PyUnicode_Count(unicode, substring, begin, finish);
}
// ord() for a str: returns the code point of a one-character string as a
// short tagged int. For any other length, sets TypeError and returns
// CPY_INT_TAG.
CPyTagged CPyStr_Ord(PyObject *obj) {
    Py_ssize_t length = PyUnicode_GET_LENGTH(obj);
    if (length != 1) {
        PyErr_Format(
            PyExc_TypeError, "ord() expected a character, but a string of length %zd found", length);
        return CPY_INT_TAG;
    }

    int kind = PyUnicode_KIND(obj);
    Py_UCS4 code_point = PyUnicode_READ(kind, PyUnicode_DATA(obj), 0);
    // Tag as a short int (value << 1).
    return code_point << 1;
}

View File

@@ -0,0 +1,62 @@
// Tuple primitive operations
//
// These are registered in mypyc.primitives.tuple_ops.
#include <Python.h>
#include "CPy.h"
// Return tuple[index] as a new reference. Negative indexes count from the
// end. Returns NULL with IndexError set when the index is out of range,
// and NULL with OverflowError set when `index` is not a short tagged int.
PyObject *CPySequenceTuple_GetItem(PyObject *tuple, CPyTagged index) {
    if (!CPyTagged_CheckShort(index)) {
        PyErr_SetString(PyExc_OverflowError, CPYTHON_LARGE_INT_ERRMSG);
        return NULL;
    }

    Py_ssize_t pos = CPyTagged_ShortAsSsize_t(index);
    Py_ssize_t size = PyTuple_GET_SIZE(tuple);

    if (pos < 0) {
        pos += size;
        if (pos < 0) {
            PyErr_SetString(PyExc_IndexError, "tuple index out of range");
            return NULL;
        }
    } else if (pos >= size) {
        PyErr_SetString(PyExc_IndexError, "tuple index out of range");
        return NULL;
    }

    PyObject *item = PyTuple_GET_ITEM(tuple, pos);
    Py_INCREF(item);
    return item;
}
// Return obj[start:end]. Exact tuples with short tagged bounds take a fast
// path through PyTuple_GetSlice; everything else is delegated to
// CPyObject_GetSlice.
PyObject *CPySequenceTuple_GetSlice(PyObject *obj, CPyTagged start, CPyTagged end) {
    if (unlikely(!PyTuple_CheckExact(obj)
                 || !CPyTagged_CheckShort(start) || !CPyTagged_CheckShort(end))) {
        return CPyObject_GetSlice(obj, start, end);
    }

    Py_ssize_t lo = CPyTagged_ShortAsSsize_t(start);
    Py_ssize_t hi = CPyTagged_ShortAsSsize_t(end);

    // Negative bounds count from the end; no further clamping is done
    // here before calling PyTuple_GetSlice.
    if (lo < 0) {
        lo += PyTuple_GET_SIZE(obj);
    }
    if (hi < 0) {
        hi += PyTuple_GET_SIZE(obj);
    }
    return PyTuple_GetSlice(obj, lo, hi);
}
// No error checking
// Return tuple[index] as a new reference without validating `index`.
PyObject *CPySequenceTuple_GetItemUnsafe(PyObject *tuple, Py_ssize_t index)
{
    PyObject *item = PyTuple_GET_ITEM(tuple, index);
    Py_INCREF(item);
    return item;
}
// PyTuple_SET_ITEM does no error checking,
// and should only be used to fill in brand new tuples.
// Store `value` at tuple[index] through PyTuple_SET_ITEM; neither the
// index nor the tuple is validated.
void CPySequenceTuple_SetItemUnsafe(PyObject *tuple, Py_ssize_t index, PyObject *value)
{
    PyTuple_SET_ITEM(tuple, index, value);
}