Send patches - preferably formatted by git format-patch - to patches at archlinux32 dot org.
summary refs log tree commit diff
path: root/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
diff options
context:
space:
mode:
Diffstat (limited to 'community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch')
-rw-r--r-- community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch 1704
1 files changed, 1704 insertions, 0 deletions
diff --git a/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
new file mode 100644
index 00000000..608c8224
--- /dev/null
+++ b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
@@ -0,0 +1,1704 @@
+This fixes SIGILLs caused by SSE2 instructions when running LuaJIT on CPUs that lack SSE2.
+
+Signed-off-by: Tasos Sahanidis <tasos@tasossah.com>
+---
+Sending v2 because git parsed the v1 patch as binary
+
+ community/luajit/PKGBUILD.i686 | 9 +
+ ...5e1a1b49871e645252bb12e722fb4879df11.patch | 1668 +++++++++++++++++
+ 2 files changed, 1677 insertions(+)
+ create mode 100644 community/luajit/PKGBUILD.i686
+ create mode 100644 community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
+
+diff --git a/community/luajit/PKGBUILD.i686 b/community/luajit/PKGBUILD.i686
+new file mode 100644
+index 00000000..8c266de6
+--- /dev/null
++++ b/community/luajit/PKGBUILD.i686
+@@ -0,0 +1,9 @@
++build() {
++ cd "luajit-2.0-${_commit::7}"
++ patch -p1 -i "$srcdir/c7815e1a1b49871e645252bb12e722fb4879df11.patch"
++}
++
++source+=(c7815e1a1b49871e645252bb12e722fb4879df11.patch)
++md5sums+=(67ce6dcf6eee2979688896c4016f8970)
++sha256sums+=(364e92a2ef79378d3340ba011e2c1be2d432c9396a77e4279be117e1bf567951)
++b2sums+=(22268efff79d793f806dfa52e8c23aba09879c79e83658024bd792d7463add3c7664f66b6981822d115bb990d95fcf5ce10c9be552ac3904897d39e4e4007ceb)
+diff --git a/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
+new file mode 100644
+index 00000000..37434173
+--- /dev/null
++++ b/community/luajit/c7815e1a1b49871e645252bb12e722fb4879df11.patch
+@@ -0,0 +1,1668 @@
++From c7815e1a1b49871e645252bb12e722fb4879df11 Mon Sep 17 00:00:00 2001
++From: Tasos Sahanidis <tasos@tasossah.com>
++Date: Mon, 30 Jan 2023 22:57:23 +0200
++Subject: [PATCH] Revert "x86: Remove x87 support from interpreter."
++
++This reverts commit 57768cd5882eb8d39c673d9dd8598946ef7c1843.
++The JIT compiler is disabled by default (see src/Makefile) and is untested.
++---
++ src/Makefile | 13 +-
++ src/lib_jit.c | 44 ++-
++ src/lj_asm.c | 16 +
++ src/lj_jit.h | 18 +-
++ src/lj_vm.h | 3 +-
++ src/msvcbuild.bat | 1 -
++ src/vm_x86.dasc | 798 +++++++++++++++++++++++++++++++++++++++++-----
++ 7 files changed, 793 insertions(+), 100 deletions(-)
++
++diff --git a/src/Makefile b/src/Makefile
++index 30d64be2a..f226cc2db 100644
++--- a/src/Makefile
+++++ b/src/Makefile
++@@ -44,10 +44,13 @@ CCOPT= -O2 -fomit-frame-pointer
++ #
++ # Target-specific compiler options:
++ #
+++# x86 only: it's recommended to compile at least for i686. Better yet,
+++# compile for an architecture that has SSE2, too (-msse -msse2).
+++#
++ # x86/x64 only: For GCC 4.2 or higher and if you don't intend to distribute
++ # the binaries to a different machine you could also use: -march=native
++ #
++-CCOPT_x86= -march=i686 -msse -msse2 -mfpmath=sse
+++CCOPT_x86= -march=i686 -msse -mfpmath=sse
++ CCOPT_x64=
++ CCOPT_arm=
++ CCOPT_arm64=
++@@ -102,7 +105,7 @@ XCFLAGS=
++ #XCFLAGS+= -DLUAJIT_ENABLE_LUA52COMPAT
++ #
++ # Disable the JIT compiler, i.e. turn LuaJIT into a pure interpreter.
++-#XCFLAGS+= -DLUAJIT_DISABLE_JIT
+++XCFLAGS+= -DLUAJIT_DISABLE_JIT
++ #
++ # Some architectures (e.g. PPC) can use either single-number (1) or
++ # dual-number (2) mode. Uncomment one of these lines to override the
++@@ -437,6 +440,11 @@ DASM_AFLAGS+= -D VER=$(subst LJ_ARCH_VERSION_,,$(filter LJ_ARCH_VERSION_%,$(subs
++ ifeq (Windows,$(TARGET_SYS))
++ DASM_AFLAGS+= -D WIN
++ endif
+++ifeq (x86,$(TARGET_LJARCH))
+++ ifneq (,$(findstring __SSE2__ 1,$(TARGET_TESTARCH)))
+++ DASM_AFLAGS+= -D SSE
+++ endif
+++else
++ ifeq (x64,$(TARGET_LJARCH))
++ ifeq (,$(findstring LJ_FR2 1,$(TARGET_TESTARCH)))
++ DASM_ARCH= x86
++@@ -466,6 +474,7 @@ ifeq (ppc,$(TARGET_LJARCH))
++ endif
++ endif
++ endif
+++endif
++
++ DASM_FLAGS= $(DASM_XFLAGS) $(DASM_AFLAGS)
++ DASM_DASC= vm_$(DASM_ARCH).dasc
++diff --git a/src/lib_jit.c b/src/lib_jit.c
++index 2867d4206..2edecfcc2 100644
++--- a/src/lib_jit.c
+++++ b/src/lib_jit.c
++@@ -649,7 +649,7 @@ JIT_PARAMDEF(JIT_PARAMINIT)
++ #endif
++
++ /* Arch-dependent CPU feature detection. */
++-static uint32_t jit_cpudetect(void)
+++static uint32_t jit_cpudetect(lua_State *L)
++ {
++ uint32_t flags = 0;
++ #if LJ_TARGET_X86ORX64
++@@ -657,16 +657,45 @@ static uint32_t jit_cpudetect(void)
++ uint32_t vendor[4];
++ uint32_t features[4];
++ if (lj_vm_cpuid(0, vendor) && lj_vm_cpuid(1, features)) {
+++#if !LJ_HASJIT
+++#define JIT_F_CMOV 1
+++#define JIT_F_SSE2 2
+++#endif
+++ flags |= ((features[3] >> 15)&1) * JIT_F_CMOV;
+++ flags |= ((features[3] >> 26)&1) * JIT_F_SSE2;
+++#if LJ_HASJIT
++ flags |= ((features[2] >> 0)&1) * JIT_F_SSE3;
++ flags |= ((features[2] >> 19)&1) * JIT_F_SSE4_1;
+++ if (vendor[2] == 0x6c65746e) { /* Intel. */
+++ if ((features[0] & 0x0ff00f00) == 0x00000f00) /* P4. */
+++ flags |= JIT_F_P4; /* Currently unused. */
+++ else if ((features[0] & 0x0fff0ff0) == 0x000106c0) /* Atom. */
+++ flags |= JIT_F_LEA_AGU;
+++ } else if (vendor[2] == 0x444d4163) { /* AMD. */
+++ uint32_t fam = (features[0] & 0x0ff00f00);
+++ if (fam == 0x00000f00) /* K8. */
+++ flags |= JIT_F_SPLIT_XMM;
+++ if (fam >= 0x00000f00) /* K8, K10. */
+++ flags |= JIT_F_PREFER_IMUL;
+++ }
++ if (vendor[0] >= 7) {
++ uint32_t xfeatures[4];
++ lj_vm_cpuid(7, xfeatures);
++ flags |= ((xfeatures[1] >> 8)&1) * JIT_F_BMI2;
++ }
+++#endif
++ }
++- /* Don't bother checking for SSE2 -- the VM will crash before getting here. */
++-
+++ /* Check for required instruction set support on x86 (unnecessary on x64). */
+++#if LJ_TARGET_X86
+++#if !defined(LUAJIT_CPU_NOCMOV)
+++ if (!(flags & JIT_F_CMOV))
+++ luaL_error(L, "CPU not supported");
+++#endif
+++#if defined(LUAJIT_CPU_SSE2)
+++ if (!(flags & JIT_F_SSE2))
+++ luaL_error(L, "CPU does not support SSE2 (recompile without -DLUAJIT_CPU_SSE2)");
+++#endif
+++#endif
++ #elif LJ_TARGET_ARM
++
++ int ver = LJ_ARCH_VERSION; /* Compile-time ARM CPU detection. */
++@@ -729,7 +758,12 @@ static uint32_t jit_cpudetect(void)
++ static void jit_init(lua_State *L)
++ {
++ jit_State *J = L2J(L);
++- J->flags = jit_cpudetect() | JIT_F_ON | JIT_F_OPT_DEFAULT;
+++ uint32_t flags = jit_cpudetect(L);
+++#if LJ_TARGET_X86
+++ /* Silently turn off the JIT compiler on CPUs without SSE2. */
+++ if ((flags & JIT_F_SSE2))
+++#endif
+++ J->flags = flags | JIT_F_ON | JIT_F_OPT_DEFAULT;
++ memcpy(J->param, jit_param_default, sizeof(J->param));
++ lj_dispatch_update(G(L));
++ }
++@@ -738,7 +772,7 @@ static void jit_init(lua_State *L)
++ LUALIB_API int luaopen_jit(lua_State *L)
++ {
++ #if LJ_HASJIT
++- jit_init(L);
+++ jit_init(L); // FIXME should this be moved back to the bottom?
++ #endif
++ lua_pushliteral(L, LJ_OS_NAME);
++ lua_pushliteral(L, LJ_ARCH_NAME);
++diff --git a/src/lj_asm.c b/src/lj_asm.c
++index 6f5e0c45b..eda81f1e5 100644
++--- a/src/lj_asm.c
+++++ b/src/lj_asm.c
++@@ -2340,6 +2340,22 @@ static void asm_setup_regsp(ASMState *as)
++ }
++ break;
++ #endif
+++/*
+++ case IR_FPMATH:
+++#if LJ_TARGET_X86ORX64
+++ if (ir->op2 == IRFPM_EXP2) { // May be joined to lj_vm_pow_sse.
+++ ir->prev = REGSP_HINT(RID_XMM0);
+++#if !LJ_64
+++ if (as->evenspill < 4) // Leave room for 16 byte scratch area.
+++ as->evenspill = 4;
+++#endif
+++ if (inloop)
+++ as->modset |= RSET_RANGE(RID_XMM0, RID_XMM2+1)|RID2RSET(RID_EAX);
+++ continue;
+++ } else if (ir->op2 <= IRFPM_TRUNC && !(as->flags & JIT_F_SSE4_1)) {
+++ ir->prev = REGSP_HINT(RID_XMM0);
+++>>>>>>> parent of 57768cd5... x86: Remove x87 support from interpreter.
+++ */
++ case IR_FPMATH:
++ #if LJ_TARGET_X86ORX64
++ if (ir->op2 <= IRFPM_TRUNC) {
++diff --git a/src/lj_jit.h b/src/lj_jit.h
++index 7f081730e..85916b834 100644
++--- a/src/lj_jit.h
+++++ b/src/lj_jit.h
++@@ -20,12 +20,18 @@
++
++ #if LJ_TARGET_X86ORX64
++
++-#define JIT_F_SSE3 (JIT_F_CPU << 0)
++-#define JIT_F_SSE4_1 (JIT_F_CPU << 1)
++-#define JIT_F_BMI2 (JIT_F_CPU << 2)
++-
++-
++-#define JIT_F_CPUSTRING "\4SSE3\6SSE4.1\4BMI2"
+++#define JIT_F_CMOV (JIT_F_CPU << 0)
+++#define JIT_F_SSE2 (JIT_F_CPU << 1)
+++#define JIT_F_SSE3 (JIT_F_CPU << 2)
+++#define JIT_F_SSE4_1 (JIT_F_CPU << 3)
+++#define JIT_F_P4 (JIT_F_CPU << 4)
+++#define JIT_F_PREFER_IMUL (JIT_F_CPU << 5)
+++#define JIT_F_SPLIT_XMM (JIT_F_CPU << 6)
+++#define JIT_F_LEA_AGU (JIT_F_CPU << 7)
+++#define JIT_F_BMI2 (JIT_F_CPU << 8)
+++
+++
+++#define JIT_F_CPUSTRING "\4CMOV\4SSE2\4SSE3\6SSE4.1\2P4\3AMD\2K8\4ATOM\4BMI2"
++
++ #elif LJ_TARGET_ARM
++
++diff --git a/src/lj_vm.h b/src/lj_vm.h
++index c66db0049..9bc6d62fa 100644
++--- a/src/lj_vm.h
+++++ b/src/lj_vm.h
++@@ -58,7 +58,8 @@ LJ_ASMF void lj_vm_exit_handler(void);
++ LJ_ASMF void lj_vm_exit_interp(void);
++
++ /* Internal math helper functions. */
++-#if LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
+++// FIXME: is this correct?
+++#if LJ_TARGET_X86ORX64 || LJ_TARGET_PPC || LJ_TARGET_ARM64 || (LJ_TARGET_MIPS && LJ_ABI_SOFTFP)
++ #define lj_vm_floor floor
++ #define lj_vm_ceil ceil
++ #else
++diff --git a/src/msvcbuild.bat b/src/msvcbuild.bat
++index d323d8d44..67e53574d 100644
++--- a/src/msvcbuild.bat
+++++ b/src/msvcbuild.bat
++@@ -41,7 +41,6 @@ if exist minilua.exe.manifest^
++ @set DASC=vm_x86.dasc
++ @set DASMFLAGS=-D WIN -D JIT -D FFI
++ @set LJARCH=x86
++-@set LJCOMPILE=%LJCOMPILE% /arch:SSE2
++ :X64
++ @if "%1" neq "nogc64" goto :GC64
++ @shift
++diff --git a/src/vm_x86.dasc b/src/vm_x86.dasc
++index 18ca87b54..3efbba6cd 100644
++--- a/src/vm_x86.dasc
+++++ b/src/vm_x86.dasc
++@@ -18,6 +18,7 @@
++ |
++ |.if P64
++ |.define X64, 1
+++|.define SSE, 1
++ |.if WIN
++ |.define X64WIN, 1
++ |.endif
++@@ -439,6 +440,7 @@
++ | fpop
++ |.endmacro
++ |
+++|.macro fdup; fld st0; .endmacro
++ |.macro fpop1; fstp st1; .endmacro
++ |
++ |// Synthesize SSE FP constants.
++@@ -464,6 +466,9 @@
++ |.macro sseconst_1, reg, tmp // Synthesize 1.0.
++ | sseconst_hi reg, tmp, 3ff00000
++ |.endmacro
+++|.macro sseconst_m1, reg, tmp // Synthesize -1.0.
+++| sseconst_hi reg, tmp, bff00000
+++|.endmacro
++ |.macro sseconst_2p52, reg, tmp // Synthesize 2^52.
++ | sseconst_hi reg, tmp, 43300000
++ |.endmacro
++@@ -943,9 +948,13 @@ static void build_subroutines(BuildCtx *ctx)
++ |.if DUALNUM
++ | mov TMP2, LJ_TISNUM
++ | mov TMP1, RC
++- |.else
+++ |.elif SSE
++ | cvtsi2sd xmm0, RC
++ | movsd TMPQ, xmm0
+++ |.else
+++ | mov ARG4, RC
+++ | fild ARG4
+++ | fstp TMPQ
++ |.endif
++ | lea RCa, TMPQ // Store temp. TValue in TMPQ.
++ | jmp >1
++@@ -1031,9 +1040,13 @@ static void build_subroutines(BuildCtx *ctx)
++ |.if DUALNUM
++ | mov TMP2, LJ_TISNUM
++ | mov TMP1, RC
++- |.else
+++ |.elif SSE
++ | cvtsi2sd xmm0, RC
++ | movsd TMPQ, xmm0
+++ |.else
+++ | mov ARG4, RC
+++ | fild ARG4
+++ | fstp TMPQ
++ |.endif
++ | lea RCa, TMPQ // Store temp. TValue in TMPQ.
++ | jmp >1
++@@ -1416,6 +1429,19 @@ static void build_subroutines(BuildCtx *ctx)
++ | cmp NARGS:RD, 2+1; jb ->fff_fallback
++ |.endmacro
++ |
+++ |.macro .ffunc_n, name
+++ | .ffunc_1 name
+++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+++ | fld qword [BASE]
+++ |.endmacro
+++ |
+++ |.macro .ffunc_n, name, op
+++ | .ffunc_1 name
+++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+++ | op
+++ | fld qword [BASE]
+++ |.endmacro
+++ |
++ |.macro .ffunc_nsse, name, op
++ | .ffunc_1 name
++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
++@@ -1426,6 +1452,14 @@ static void build_subroutines(BuildCtx *ctx)
++ | .ffunc_nsse name, movsd
++ |.endmacro
++ |
+++ |.macro .ffunc_nn, name
+++ | .ffunc_2 name
+++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+++ | cmp dword [BASE+12], LJ_TISNUM; jae ->fff_fallback
+++ | fld qword [BASE]
+++ | fld qword [BASE+8]
+++ |.endmacro
+++ |
++ |.macro .ffunc_nnsse, name
++ | .ffunc_2 name
++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
++@@ -1631,7 +1665,11 @@ static void build_subroutines(BuildCtx *ctx)
++ |.else
++ | jae ->fff_fallback
++ |.endif
+++ |.if SSE
++ | movsd xmm0, qword [BASE]; jmp ->fff_resxmm0
+++ |.else
+++ | fld qword [BASE]; jmp ->fff_resn
+++ |.endif
++ |
++ |.ffunc_1 tostring
++ | // Only handles the string or number case inline.
++@@ -1729,12 +1767,19 @@ static void build_subroutines(BuildCtx *ctx)
++ | add RD, 1
++ | mov dword [BASE-4], LJ_TISNUM
++ | mov dword [BASE-8], RD
++- |.else
+++ |.elif SSE
++ | movsd xmm0, qword [BASE+8]
++ | sseconst_1 xmm1, RBa
++ | addsd xmm0, xmm1
++ | cvttsd2si RD, xmm0
++ | movsd qword [BASE-8], xmm0
+++ |.else
+++ | fld qword [BASE+8]
+++ | fld1
+++ | faddp st1
+++ | fist ARG1
+++ | fstp qword [BASE-8]
+++ | mov RD, ARG1
++ |.endif
++ | mov TAB:RB, [BASE]
++ | cmp RD, TAB:RB->asize; jae >2 // Not in array part?
++@@ -1783,9 +1828,12 @@ static void build_subroutines(BuildCtx *ctx)
++ |.if DUALNUM
++ | mov dword [BASE+12], LJ_TISNUM
++ | mov dword [BASE+8], 0
++- |.else
+++ |.elif SSE
++ | xorps xmm0, xmm0
++ | movsd qword [BASE+8], xmm0
+++ |.else
+++ | fldz
+++ | fstp qword [BASE+8]
++ |.endif
++ | mov RD, 1+3
++ | jmp ->fff_res
++@@ -2017,11 +2065,6 @@ static void build_subroutines(BuildCtx *ctx)
++ |->fff_resi: // Dummy.
++ |.endif
++ |
++- |->fff_resn:
++- | mov PC, [BASE-4]
++- | fstp qword [BASE-8]
++- | jmp ->fff_res1
++- |
++ | .ffunc_1 math_abs
++ |.if DUALNUM
++ | cmp dword [BASE+4], LJ_TISNUM; jne >2
++@@ -2044,6 +2087,8 @@ static void build_subroutines(BuildCtx *ctx)
++ |.else
++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
++ |.endif
+++ |
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++ | sseconst_abs xmm1, RDa
++ | andps xmm0, xmm1
++@@ -2051,6 +2096,15 @@ static void build_subroutines(BuildCtx *ctx)
++ | mov PC, [BASE-4]
++ | movsd qword [BASE-8], xmm0
++ | // fallthrough
+++ |.else
+++ | fld qword [BASE]
+++ | fabs
+++ | // fallthrough
+++ |->fff_resxmm0: // Dummy.
+++ |->fff_resn:
+++ | mov PC, [BASE-4]
+++ | fstp qword [BASE-8]
+++ |.endif
++ |
++ |->fff_res1:
++ | mov RD, 1+1
++@@ -2093,8 +2147,9 @@ static void build_subroutines(BuildCtx *ctx)
++ |.else
++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
++ |.endif
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++- | call ->vm_ .. func .. _sse
+++ | call ->vm_ .. func
++ |.if DUALNUM
++ | cvttsd2si RB, xmm0
++ | cmp RB, 0x80000000
++@@ -2105,29 +2160,61 @@ static void build_subroutines(BuildCtx *ctx)
++ | je ->fff_resi
++ |.endif
++ | jmp ->fff_resxmm0
+++ |.else
+++ | fld qword [BASE]
+++ | call ->vm_ .. func
+++ | .if DUALNUM
+++ | fist ARG1
+++ | mov RB, ARG1
+++ | cmp RB, 0x80000000; jne >2
+++ | fdup
+++ | fild ARG1
+++ | fcomparepp
+++ | jp ->fff_resn
+++ | jne ->fff_resn
+++ |2:
+++ | fpop
+++ | jmp ->fff_resi
+++ | .else
+++ | jmp ->fff_resn
+++ | .endif
+++ |.endif
++ |.endmacro
++ |
++ | math_round floor
++ | math_round ceil
++ |
+++ |.if SSE
++ |.ffunc_nsse math_sqrt, sqrtsd; jmp ->fff_resxmm0
+++ |.else
+++ |.ffunc_n math_sqrt; fsqrt; jmp ->fff_resn
+++ |.endif
++ |
++ |.ffunc math_log
++ | cmp NARGS:RD, 1+1; jne ->fff_fallback // Exactly one argument.
++ | cmp dword [BASE+4], LJ_TISNUM; jae ->fff_fallback
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++- |.if not X64
++- | movsd FPARG1, xmm0
++- |.endif
+++ | .if not X64
+++ | movsd FPARG1, xmm0
+++ | .endif
++ | mov RB, BASE
++ | call extern log
++ | mov BASE, RB
++ | jmp ->fff_resfp
+++ |.else
+++ | fldln2; fld qword [BASE]; fyl2x; jmp ->fff_resn
+++ |.endif
++ |
++ |.macro math_extern, func
+++ |.if SSE
++ | .ffunc_nsse math_ .. func
++- |.if not X64
++- | movsd FPARG1, xmm0
+++ | .if not X64
+++ | movsd FPARG1, xmm0
+++ | .endif
+++ |.else
+++ | .ffunc_n math_ .. func
+++ | fstp FPARG1
++ |.endif
++ | mov RB, BASE
++ | call extern func
++@@ -2136,10 +2223,16 @@ static void build_subroutines(BuildCtx *ctx)
++ |.endmacro
++ |
++ |.macro math_extern2, func
++- | .ffunc_nnsse math_ .. func
++ |.if not X64
++- | movsd FPARG1, xmm0
++- | movsd FPARG3, xmm1
+++ | .if SSE
+++ | .ffunc_nnsse math_ .. func
+++ | movsd FPARG1, xmm0
+++ | movsd FPARG3, xmm1
+++ | .else
+++ | .ffunc_nn math_ .. func
+++ | fstp FPARG3
+++ | fstp FPARG1
+++ | .endif
++ |.endif
++ | mov RB, BASE
++ | call extern func
++@@ -2176,34 +2269,65 @@ static void build_subroutines(BuildCtx *ctx)
++ | cmp RB, 0x00200000; jb >4
++ |1:
++ | shr RB, 21; sub RB, RC // Extract and unbias exponent.
+++ |.if SSE
++ | cvtsi2sd xmm0, RB
+++ |.else
+++ | mov TMP1, RB; fild TMP1
+++ |.endif
++ | mov RB, [BASE-4]
++ | and RB, 0x800fffff // Mask off exponent.
++ | or RB, 0x3fe00000 // Put mantissa in range [0.5,1) or 0.
++ | mov [BASE-4], RB
++ |2:
+++ |.if SSE
++ | movsd qword [BASE], xmm0
+++ |.else
+++ | fstp qword [BASE]
+++ |.endif
++ | mov RD, 1+2
++ | jmp ->fff_res
++ |3: // Return +-0, +-Inf, NaN unmodified and an exponent of 0.
+++ |.if SSE
++ | xorps xmm0, xmm0; jmp <2
+++ |.else
+++ | fldz; jmp <2
+++ |.endif
++ |4: // Handle denormals by multiplying with 2^54 and adjusting the bias.
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++ | sseconst_hi xmm1, RBa, 43500000 // 2^54.
++ | mulsd xmm0, xmm1
++ | movsd qword [BASE-8], xmm0
+++ |.else
+++ | fld qword [BASE]
+++ | mov TMP1, 0x5a800000; fmul TMP1 // x = x*2^54
+++ | fstp qword [BASE-8]
+++ |.endif
++ | mov RB, [BASE-4]; mov RC, 1076; shl RB, 1; jmp <1
++ |
+++ |.if SSE
++ |.ffunc_nsse math_modf
+++ |.else
+++ |.ffunc_n math_modf
+++ |.endif
++ | mov RB, [BASE+4]
++ | mov PC, [BASE-4]
++ | shl RB, 1; cmp RB, 0xffe00000; je >4 // +-Inf?
+++ |.if SSE
++ | movaps xmm4, xmm0
++- | call ->vm_trunc_sse
+++ | call ->vm_trunc
++ | subsd xmm4, xmm0
++ |1:
++ | movsd qword [BASE-8], xmm0
++ | movsd qword [BASE], xmm4
+++ |.else
+++ | fdup
+++ | call ->vm_trunc
+++ | fsub st1, st0
+++ |1:
+++ | fstp qword [BASE-8]
+++ | fstp qword [BASE]
+++ |.endif
++ | mov RC, [BASE-4]; mov RB, [BASE+4]
++ | xor RC, RB; js >3 // Need to adjust sign?
++ |2:
++@@ -2213,9 +2337,24 @@ static void build_subroutines(BuildCtx *ctx)
++ | xor RB, 0x80000000; mov [BASE+4], RB // Flip sign of fraction.
++ | jmp <2
++ |4:
+++ |.if SSE
++ | xorps xmm4, xmm4; jmp <1 // Return +-Inf and +-0.
+++ |.else
+++ | fldz; fxch; jmp <1 // Return +-Inf and +-0.
+++ |.endif
+++ |
+++ |.ffunc_nnr math_fmod
+++ |1: ; fprem; fnstsw ax; sahf; jp <1
+++ | fpop1
+++ | jmp ->fff_resn
+++ |
+++ |.if SSE
+++ |.ffunc_nnsse math_pow; call ->vm_pow; jmp ->fff_resxmm0
+++ |.else
+++ |.ffunc_nn math_pow; call ->vm_pow; jmp ->fff_resn
+++ |.endif
++ |
++- |.macro math_minmax, name, cmovop, sseop
+++ |.macro math_minmax, name, cmovop, fcmovop, sseop
++ | .ffunc_1 name
++ | mov RA, 2
++ | cmp dword [BASE+4], LJ_TISNUM
++@@ -2232,7 +2371,12 @@ static void build_subroutines(BuildCtx *ctx)
++ |3:
++ | ja ->fff_fallback
++ | // Convert intermediate result to number and continue below.
+++ |.if SSE
++ | cvtsi2sd xmm0, RB
+++ |.else
+++ | mov TMP1, RB
+++ | fild TMP1
+++ |.endif
++ | jmp >6
++ |4:
++ | ja ->fff_fallback
++@@ -2240,6 +2384,7 @@ static void build_subroutines(BuildCtx *ctx)
++ | jae ->fff_fallback
++ |.endif
++ |
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++ |5: // Handle numbers or integers.
++ | cmp RA, RD; jae ->fff_resxmm0
++@@ -2258,10 +2403,34 @@ static void build_subroutines(BuildCtx *ctx)
++ | sseop xmm0, xmm1
++ | add RA, 1
++ | jmp <5
+++ |.else
+++ | fld qword [BASE]
+++ |5: // Handle numbers or integers.
+++ | cmp RA, RD; jae ->fff_resn
+++ | cmp dword [BASE+RA*8-4], LJ_TISNUM
+++ |.if DUALNUM
+++ | jb >6
+++ | ja >9
+++ | fild dword [BASE+RA*8-8]
+++ | jmp >7
+++ |.else
+++ | jae >9
+++ |.endif
+++ |6:
+++ | fld qword [BASE+RA*8-8]
+++ |7:
+++ | fucomi st1; fcmovop st1; fpop1
+++ | add RA, 1
+++ | jmp <5
+++ |.endif
++ |.endmacro
++ |
++- | math_minmax math_min, cmovg, minsd
++- | math_minmax math_max, cmovl, maxsd
+++ | math_minmax math_min, cmovg, fcmovnbe, minsd
+++ | math_minmax math_max, cmovl, fcmovbe, maxsd
+++ |.if not SSE
+++ |9:
+++ | fpop; jmp ->fff_fallback
+++ |.endif
++ |
++ |//-- String library -----------------------------------------------------
++ |
++@@ -2275,8 +2444,10 @@ static void build_subroutines(BuildCtx *ctx)
++ | movzx RB, byte STR:RB[1]
++ |.if DUALNUM
++ | jmp ->fff_resi
++- |.else
+++ |.elif SSE
++ | cvtsi2sd xmm0, RB; jmp ->fff_resxmm0
+++ |.else
+++ | mov TMP1, RB; fild TMP1; jmp ->fff_resn
++ |.endif
++ |
++ |.ffunc string_char // Only handle the 1-arg case here.
++@@ -2288,11 +2459,16 @@ static void build_subroutines(BuildCtx *ctx)
++ | mov RB, dword [BASE]
++ | cmp RB, 255; ja ->fff_fallback
++ | mov TMP2, RB
++- |.else
+++ |.elif SSE
++ | jae ->fff_fallback
++ | cvttsd2si RB, qword [BASE]
++ | cmp RB, 255; ja ->fff_fallback
++ | mov TMP2, RB
+++ |.else
+++ | jae ->fff_fallback
+++ | fld qword [BASE]
+++ | fistp TMP2
+++ | cmp TMP2, 255; ja ->fff_fallback
++ |.endif
++ |.if X64
++ | mov TMP3, 1
++@@ -2331,10 +2507,14 @@ static void build_subroutines(BuildCtx *ctx)
++ | jne ->fff_fallback
++ | mov RB, dword [BASE+16]
++ | mov TMP2, RB
++- |.else
+++ |.elif SSE
++ | jae ->fff_fallback
++ | cvttsd2si RB, qword [BASE+16]
++ | mov TMP2, RB
+++ |.else
+++ | jae ->fff_fallback
+++ | fld qword [BASE+16]
+++ | fistp TMP2
++ |.endif
++ |1:
++ | cmp dword [BASE+4], LJ_TSTR; jne ->fff_fallback
++@@ -2349,8 +2529,12 @@ static void build_subroutines(BuildCtx *ctx)
++ | mov RB, STR:RB->len
++ |.if DUALNUM
++ | mov RA, dword [BASE+8]
++- |.else
+++ |.elif SSE
++ | cvttsd2si RA, qword [BASE+8]
+++ |.else
+++ | fld qword [BASE+8]
+++ | fistp ARG3
+++ | mov RA, ARG3
++ |.endif
++ | mov RC, TMP2
++ | cmp RB, RC // len < end? (unsigned compare)
++@@ -2418,10 +2602,16 @@ static void build_subroutines(BuildCtx *ctx)
++ |
++ |//-- Bit library --------------------------------------------------------
++ |
+++ |.define TOBIT_BIAS, 0x59c00000 // 2^52 + 2^51 (float, not double!).
+++ |
++ |.macro .ffunc_bit, name, kind, fdef
++ | fdef name
++ |.if kind == 2
+++ |.if SSE
++ | sseconst_tobit xmm1, RBa
+++ |.else
+++ | mov TMP1, TOBIT_BIAS
+++ |.endif
++ |.endif
++ | cmp dword [BASE+4], LJ_TISNUM
++ |.if DUALNUM
++@@ -2437,12 +2627,24 @@ static void build_subroutines(BuildCtx *ctx)
++ |.else
++ | jae ->fff_fallback
++ |.endif
+++ |.if SSE
++ | movsd xmm0, qword [BASE]
++ |.if kind < 2
++ | sseconst_tobit xmm1, RBa
++ |.endif
++ | addsd xmm0, xmm1
++ | movd RB, xmm0
+++ |.else
+++ | fld qword [BASE]
+++ |.if kind < 2
+++ | mov TMP1, TOBIT_BIAS
+++ |.endif
+++ | fadd TMP1
+++ | fstp FPARG1
+++ |.if kind > 0
+++ | mov RB, ARG1
+++ |.endif
+++ |.endif
++ |2:
++ |.endmacro
++ |
++@@ -2451,7 +2653,15 @@ static void build_subroutines(BuildCtx *ctx)
++ |.endmacro
++ |
++ |.ffunc_bit bit_tobit, 0
+++ |.if DUALNUM or SSE
+++ |.if not SSE
+++ | mov RB, ARG1
+++ |.endif
++ | jmp ->fff_resbit
+++ |.else
+++ | fild ARG1
+++ | jmp ->fff_resn
+++ |.endif
++ |
++ |.macro .ffunc_bit_op, name, ins
++ | .ffunc_bit name, 2
++@@ -2471,10 +2681,17 @@ static void build_subroutines(BuildCtx *ctx)
++ |.else
++ | jae ->fff_fallback_bit_op
++ |.endif
+++ |.if SSE
++ | movsd xmm0, qword [RD]
++ | addsd xmm0, xmm1
++ | movd RA, xmm0
++ | ins RB, RA
+++ |.else
+++ | fld qword [RD]
+++ | fadd TMP1
+++ | fstp FPARG1
+++ | ins RB, ARG1
+++ |.endif
++ | sub RD, 8
++ | jmp <1
++ |.endmacro
++@@ -2491,10 +2708,15 @@ static void build_subroutines(BuildCtx *ctx)
++ | not RB
++ |.if DUALNUM
++ | jmp ->fff_resbit
++- |.else
+++ |.elif SSE
++ |->fff_resbit:
++ | cvtsi2sd xmm0, RB
++ | jmp ->fff_resxmm0
+++ |.else
+++ |->fff_resbit:
+++ | mov ARG1, RB
+++ | fild ARG1
+++ | jmp ->fff_resn
++ |.endif
++ |
++ |->fff_fallback_bit_op:
++@@ -2507,13 +2729,22 @@ static void build_subroutines(BuildCtx *ctx)
++ | // Note: no inline conversion from number for 2nd argument!
++ | cmp dword [BASE+12], LJ_TISNUM; jne ->fff_fallback
++ | mov RA, dword [BASE+8]
++- |.else
+++ |.elif SSE
++ | .ffunc_nnsse name
++ | sseconst_tobit xmm2, RBa
++ | addsd xmm0, xmm2
++ | addsd xmm1, xmm2
++ | movd RB, xmm0
++ | movd RA, xmm1
+++ |.else
+++ | .ffunc_nn name
+++ | mov TMP1, TOBIT_BIAS
+++ | fadd TMP1
+++ | fstp FPARG3
+++ | fadd TMP1
+++ | fstp FPARG1
+++ | mov RA, ARG3
+++ | mov RB, ARG1
++ |.endif
++ | ins RB, cl // Assumes RA is ecx.
++ | jmp ->fff_resbit
++@@ -2954,18 +3185,27 @@ static void build_subroutines(BuildCtx *ctx)
++ |//-----------------------------------------------------------------------
++ |
++ |// FP value rounding. Called by math.floor/math.ceil fast functions
++- |// and from JIT code. arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
++- |.macro vm_round, name, mode, cond
++- |->name:
++- |.if not X64 and cond
++- | movsd xmm0, qword [esp+4]
++- | call ->name .. _sse
++- | movsd qword [esp+4], xmm0 // Overwrite callee-owned arg.
++- | fld qword [esp+4]
+++ |// and from JIT code.
+++ |
+++ |// x87 variant: Arg/ret on x87 stack. No int/xmm registers modified.
+++ |.macro vm_round_x87, mode1, mode2
+++ | fnstcw word [esp+4] // Caveat: overwrites ARG1 and ARG2.
+++ | mov [esp+8], eax
+++ | mov ax, mode1
+++ | or ax, [esp+4]
+++ |.if mode2 ~= 0xffff
+++ | and ax, mode2
+++ |.endif
+++ | mov [esp+6], ax
+++ | fldcw word [esp+6]
+++ | frndint
+++ | fldcw word [esp+4]
+++ | mov eax, [esp+8]
++ | ret
++- |.endif
+++ |.endmacro
++ |
++- |->name .. _sse:
+++ |// SSE variant: arg/ret is xmm0. xmm0-xmm3 and RD (eax) modified.
+++ |.macro vm_round_sse, mode
++ | sseconst_abs xmm2, RDa
++ | sseconst_2p52 xmm3, RDa
++ | movaps xmm1, xmm0
++@@ -2986,29 +3226,37 @@ static void build_subroutines(BuildCtx *ctx)
++ | addsd xmm1, xmm3 // (|x| + 2^52) - 2^52
++ | subsd xmm1, xmm3
++ | orpd xmm1, xmm2 // Merge sign bit back in.
++- | sseconst_1 xmm3, RDa
++ | .if mode == 1 // ceil(x)?
+++ | sseconst_m1 xmm2, RDa // Must subtract -1 to preserve -0.
++ | cmpsd xmm0, xmm1, 6 // x > result?
++- | andpd xmm0, xmm3
++- | addsd xmm1, xmm0 // If yes, add 1.
++- | orpd xmm1, xmm2 // Merge sign bit back in (again).
++ | .else // floor(x)?
+++ | sseconst_1 xmm2, RDa
++ | cmpsd xmm0, xmm1, 1 // x < result?
++- | andpd xmm0, xmm3
++- | subsd xmm1, xmm0 // If yes, subtract 1.
++ | .endif
+++ | andpd xmm0, xmm2
+++ | subsd xmm1, xmm0 // If yes, subtract +-1.
++ |.endif
++ | movaps xmm0, xmm1
++ |1:
++ | ret
++ |.endmacro
++ |
++- | vm_round vm_floor, 0, 1
++- | vm_round vm_ceil, 1, JIT
++- | vm_round vm_trunc, 2, JIT
+++ |.macro vm_round, name, ssemode, mode1, mode2, extra // FIXME: EXTRA NOT USED
+++ |->name:
+++ |.if not SSE
+++ | vm_round_x87 mode1, mode2
+++ |.endif
+++ |->name .. _sse:
+++ | vm_round_sse ssemode
+++ |.endmacro
+++ |
+++ | vm_round vm_floor, 0, 0x0400, 0xf7ff, 1
+++ | vm_round vm_ceil, 1, 0x0800, 0xfbff, JIT
+++ | vm_round vm_trunc, 2, 0x0c00, 0xffff, JIT
++ |
++ |// FP modulo x%y. Called by BC_MOD* and vm_arith.
++ |->vm_mod:
+++ |.if SSE
++ |// Args in xmm0/xmm1, return value in xmm0.
++ |// Caveat: xmm0-xmm5 and RC (eax) modified!
++ | movaps xmm5, xmm0
++@@ -3036,6 +3284,243 @@ static void build_subroutines(BuildCtx *ctx)
++ | movaps xmm0, xmm5
++ | subsd xmm0, xmm1
++ | ret
+++ |.else
+++ |// Args/ret on x87 stack (y on top). No xmm registers modified.
+++ |// Caveat: needs 3 slots on x87 stack! RC (eax) modified!
+++ | fld st1
+++ | fdiv st1
+++ | fnstcw word [esp+4]
+++ | mov ax, 0x0400
+++ | or ax, [esp+4]
+++ | and ax, 0xf7ff
+++ | mov [esp+6], ax
+++ | fldcw word [esp+6]
+++ | frndint
+++ | fldcw word [esp+4]
+++ | fmulp st1
+++ | fsubp st1
+++ | ret
+++ |.endif
+++ |
+++ |->vm_exp2raw: // Entry point for vm_pow. Without +-Inf check.
+++ | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
+++ | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
+++ |1:
+++ | ret
+++ |2:
+++ | fpop; fldz; ret
+++ |
+++ |// Generic power function x^y. Called by BC_POW, math.pow fast function,
+++ |// and vm_arith.
+++ |// Args/ret on x87 stack (y on top). RC (eax) modified.
+++ |// Caveat: needs 3 slots on x87 stack!
+++ |->vm_pow:
+++ |.if not SSE
+++ | fist dword [esp+4] // Store/reload int before comparison.
+++ | fild dword [esp+4] // Integral exponent used in vm_powi.
+++ | fucomip st1
+++ | jnz >8 // Branch for FP exponents.
+++ | jp >9 // Branch for NaN exponent.
+++ | fpop // Pop y and fallthrough to vm_powi.
+++ |
+++ |// FP/int power function x^i. Arg1/ret on x87 stack.
+++ |// Arg2 (int) on C stack. RC (eax) modified.
+++ |// Caveat: needs 2 slots on x87 stack!
+++ | mov eax, [esp+4]
+++ | cmp eax, 1; jle >6 // i<=1?
+++ | // Now 1 < (unsigned)i <= 0x80000000.
+++ |1: // Handle leading zeros.
+++ | test eax, 1; jnz >2
+++ | fmul st0
+++ | shr eax, 1
+++ | jmp <1
+++ |2:
+++ | shr eax, 1; jz >5
+++ | fdup
+++ |3: // Handle trailing bits.
+++ | fmul st0
+++ | shr eax, 1; jz >4
+++ | jnc <3
+++ | fmul st1, st0
+++ | jmp <3
+++ |4:
+++ | fmulp st1
+++ |5:
+++ | ret
+++ |6:
+++ | je <5 // x^1 ==> x
+++ | jb >7
+++ | fld1; fdivrp st1
+++ | neg eax
+++ | cmp eax, 1; je <5 // x^-1 ==> 1/x
+++ | jmp <1 // x^-i ==> (1/x)^i
+++ |7:
+++ | fpop; fld1 // x^0 ==> 1
+++ | ret
+++ |
+++ |8: // FP/FP power function x^y.
+++ | fst dword [esp+4]
+++ | fxch
+++ | fst dword [esp+8]
+++ | mov eax, [esp+4]; shl eax, 1
+++ | cmp eax, 0xff000000; je >2 // x^+-Inf?
+++ | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
+++ | cmp eax, 0xff000000; je >4 // +-Inf^y?
+++ | fyl2x
+++ | jmp ->vm_exp2raw
+++ |
+++ |9: // Handle x^NaN.
+++ | fld1
+++ | fucomip st2
+++ | je >1 // 1^NaN ==> 1
+++ | fxch // x^NaN ==> NaN
+++ |1:
+++ | fpop
+++ | ret
+++ |
+++ |2: // Handle x^+-Inf.
+++ | fabs
+++ | fld1
+++ | fucomip st1
+++ | je >3 // +-1^+-Inf ==> 1
+++ | fpop; fabs; fldz; mov eax, 0; setc al
+++ | ror eax, 1; xor eax, [esp+4]; jns >3 // |x|<>1, x^+-Inf ==> +Inf/0
+++ | fxch
+++ |3:
+++ | fpop1; fabs
+++ | ret
+++ |
+++ |4: // Handle +-0^y or +-Inf^y.
+++ | cmp dword [esp+4], 0; jge <3 // y >= 0, x^y ==> |x|
+++ | fpop; fpop
+++ | test eax, eax; jz >5 // y < 0, +-0^y ==> +Inf
+++ | fldz // y < 0, +-Inf^y ==> 0
+++ | ret
+++ |5:
+++ | mov dword [esp+4], 0x7f800000 // Return +Inf.
+++ | fld dword [esp+4]
+++ | ret
+++ |.endif
+++ |
+++ |// Args in xmm0/xmm1. Ret in xmm0. xmm0-xmm2 and RC (eax) modified.
+++ |// Needs 16 byte scratch area for x86. Also called from JIT code.
+++ |->vm_pow_sse:
+++ | cvtsd2si eax, xmm1
+++ | cvtsi2sd xmm2, eax
+++ | ucomisd xmm1, xmm2
+++ | jnz >8 // Branch for FP exponents.
+++ | jp >9 // Branch for NaN exponent.
+++ | // Fallthrough to vm_powi_sse.
+++ |
+++ |// Args in xmm0/eax. Ret in xmm0. xmm0-xmm1 and eax modified.
+++ |->vm_powi_sse:
+++ | cmp eax, 1; jle >6 // i<=1?
+++ | // Now 1 < (unsigned)i <= 0x80000000.
+++ |1: // Handle leading zeros.
+++ | test eax, 1; jnz >2
+++ | mulsd xmm0, xmm0
+++ | shr eax, 1
+++ | jmp <1
+++ |2:
+++ | shr eax, 1; jz >5
+++ | movaps xmm1, xmm0
+++ |3: // Handle trailing bits.
+++ | mulsd xmm0, xmm0
+++ | shr eax, 1; jz >4
+++ | jnc <3
+++ | mulsd xmm1, xmm0
+++ | jmp <3
+++ |4:
+++ | mulsd xmm0, xmm1
+++ |5:
+++ | ret
+++ |6:
+++ | je <5 // x^1 ==> x
+++ | jb >7 // x^0 ==> 1
+++ | neg eax
+++ | call <1
+++ | sseconst_1 xmm1, RDa
+++ | divsd xmm1, xmm0
+++ | movaps xmm0, xmm1
+++ | ret
+++ |7:
+++ | sseconst_1 xmm0, RDa
+++ | ret
+++ |
+++ |8: // FP/FP power function x^y.
+++ |.if X64
+++ | movd rax, xmm1; shl rax, 1
+++ | rol rax, 12; cmp rax, 0xffe; je >2 // x^+-Inf?
+++ | movd rax, xmm0; shl rax, 1; je >4 // +-0^y?
+++ | rol rax, 12; cmp rax, 0xffe; je >5 // +-Inf^y?
+++ | .if X64WIN
+++ | movsd qword [rsp+16], xmm1 // Use scratch area.
+++ | movsd qword [rsp+8], xmm0
+++ | fld qword [rsp+16]
+++ | fld qword [rsp+8]
+++ | .else
+++ | movsd qword [rsp-16], xmm1 // Use red zone.
+++ | movsd qword [rsp-8], xmm0
+++ | fld qword [rsp-16]
+++ | fld qword [rsp-8]
+++ | .endif
+++ |.else
+++ | movsd qword [esp+12], xmm1 // Needs 16 byte scratch area.
+++ | movsd qword [esp+4], xmm0
+++ | cmp dword [esp+12], 0; jne >1
+++ | mov eax, [esp+16]; shl eax, 1
+++ | cmp eax, 0xffe00000; je >2 // x^+-Inf?
+++ |1:
+++ | cmp dword [esp+4], 0; jne >1
+++ | mov eax, [esp+8]; shl eax, 1; je >4 // +-0^y?
+++ | cmp eax, 0xffe00000; je >5 // +-Inf^y?
+++ |1:
+++ | fld qword [esp+12]
+++ | fld qword [esp+4]
+++ |.endif
+++ | fyl2x // y*log2(x)
+++ | fdup; frndint; fsub st1, st0; fxch // Split into frac/int part.
+++ | f2xm1; fld1; faddp st1; fscale; fpop1 // ==> (2^frac-1 +1) << int
+++ |.if X64WIN
+++ | fstp qword [rsp+8] // Use scratch area.
+++ | movsd xmm0, qword [rsp+8]
+++ |.elif X64
+++ | fstp qword [rsp-8] // Use red zone.
+++ | movsd xmm0, qword [rsp-8]
+++ |.else
+++ | fstp qword [esp+4] // Needs 8 byte scratch area.
+++ | movsd xmm0, qword [esp+4]
+++ |.endif
+++ | ret
+++ |
+++ |9: // Handle x^NaN.
+++ | sseconst_1 xmm2, RDa
+++ | ucomisd xmm0, xmm2; je >1 // 1^NaN ==> 1
+++ | movaps xmm0, xmm1 // x^NaN ==> NaN
+++ |1:
+++ | ret
+++ |
+++ |2: // Handle x^+-Inf.
+++ | sseconst_abs xmm2, RDa
+++ | andpd xmm0, xmm2 // |x|
+++ | sseconst_1 xmm2, RDa
+++ | ucomisd xmm0, xmm2; je <1 // +-1^+-Inf ==> 1
+++ | movmskpd eax, xmm1
+++ | xorps xmm0, xmm0
+++ | mov ah, al; setc al; xor al, ah; jne <1 // |x|<>1, x^+-Inf ==> +Inf/0
+++ |3:
+++ | sseconst_hi xmm0, RDa, 7ff00000 // +Inf
+++ | ret
+++ |
+++ |4: // Handle +-0^y.
+++ | movmskpd eax, xmm1; test eax, eax; jnz <3 // y < 0, +-0^y ==> +Inf
+++ | xorps xmm0, xmm0 // y >= 0, +-0^y ==> 0
+++ | ret
+++ |
+++ |5: // Handle +-Inf^y.
+++ | movmskpd eax, xmm1; test eax, eax; jz <3 // y >= 0, +-Inf^y ==> +Inf
+++ | xorps xmm0, xmm0 // y < 0, +-Inf^y ==> 0
+++ | ret
++ |
++ |//-----------------------------------------------------------------------
++ |//-- Miscellaneous functions --------------------------------------------
++@@ -3429,12 +3914,19 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | // RA is a number.
++ | cmp dword [BASE+RD*8+4], LJ_TISNUM; jb >1; jne ->vmeta_comp
++ | // RA is a number, RD is an integer.
+++ |.if SSE
++ | cvtsi2sd xmm0, dword [BASE+RD*8]
++ | jmp >2
+++ |.else
+++ | fld qword [BASE+RA*8]
+++ | fild dword [BASE+RD*8]
+++ | jmp >3
+++ |.endif
++ |
++ |8: // RA is an integer, RD is not an integer.
++ | ja ->vmeta_comp
++ | // RA is an integer, RD is a number.
+++ |.if SSE
++ | cvtsi2sd xmm1, dword [BASE+RA*8]
++ | movsd xmm0, qword [BASE+RD*8]
++ | add PC, 4
++@@ -3442,15 +3934,29 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | jmp_comp jbe, ja, jb, jae, <9
++ | jmp <6
++ |.else
+++ | fild dword [BASE+RA*8]
+++ | jmp >2
+++ |.endif
+++ |.else
++ | checknum RA, ->vmeta_comp
++ | checknum RD, ->vmeta_comp
++ |.endif
+++ |.if SSE
++ |1:
++ | movsd xmm0, qword [BASE+RD*8]
++ |2:
++ | add PC, 4
++ | ucomisd xmm0, qword [BASE+RA*8]
++ |3:
+++ |.else
+++ |1:
+++ | fld qword [BASE+RA*8] // Reverse order, i.e like cmp D, A.
+++ |2:
+++ | fld qword [BASE+RD*8]
+++ |3:
+++ | add PC, 4
+++ | fcomparepp
+++ |.endif
++ | // Unordered: all of ZF CF PF set, ordered: PF clear.
++ | // To preserve NaN semantics GE/GT branch on unordered, but LT/LE don't.
++ |.if DUALNUM
++@@ -3490,25 +3996,43 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | // RD is a number.
++ | cmp dword [BASE+RA*8+4], LJ_TISNUM; jb >1; jne >5
++ | // RD is a number, RA is an integer.
+++ |.if SSE
++ | cvtsi2sd xmm0, dword [BASE+RA*8]
+++ |.else
+++ | fild dword [BASE+RA*8]
+++ |.endif
++ | jmp >2
++ |
++ |8: // RD is an integer, RA is not an integer.
++ | ja >5
++ | // RD is an integer, RA is a number.
+++ |.if SSE
++ | cvtsi2sd xmm0, dword [BASE+RD*8]
++ | ucomisd xmm0, qword [BASE+RA*8]
+++ |.else
+++ | fild dword [BASE+RD*8]
+++ | fld qword [BASE+RA*8]
+++ |.endif
++ | jmp >4
++ |
++ |.else
++ | cmp RB, LJ_TISNUM; jae >5
++ | checknum RA, >5
++ |.endif
+++ |.if SSE
++ |1:
++ | movsd xmm0, qword [BASE+RA*8]
++ |2:
++ | ucomisd xmm0, qword [BASE+RD*8]
++ |4:
+++ |.else
+++ |1:
+++ | fld qword [BASE+RA*8]
+++ |2:
+++ | fld qword [BASE+RD*8]
+++ |4:
+++ | fcomparepp
+++ |.endif
++ iseqne_fp:
++ if (vk) {
++ | jp >2 // Unordered means not equal.
++@@ -3631,21 +4155,39 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | // RA is a number.
++ | cmp dword [KBASE+RD*8+4], LJ_TISNUM; jb >1
++ | // RA is a number, RD is an integer.
+++ |.if SSE
++ | cvtsi2sd xmm0, dword [KBASE+RD*8]
+++ |.else
+++ | fild dword [KBASE+RD*8]
+++ |.endif
++ | jmp >2
++ |
++ |8: // RA is an integer, RD is a number.
+++ |.if SSE
++ | cvtsi2sd xmm0, dword [BASE+RA*8]
++ | ucomisd xmm0, qword [KBASE+RD*8]
+++ |.else
+++ | fild dword [BASE+RA*8]
+++ | fld qword [KBASE+RD*8]
+++ |.endif
++ | jmp >4
++ |.else
++ | cmp RB, LJ_TISNUM; jae >3
++ |.endif
+++ |.if SSE
++ |1:
++ | movsd xmm0, qword [KBASE+RD*8]
++ |2:
++ | ucomisd xmm0, qword [BASE+RA*8]
++ |4:
+++ |.else
+++ |1:
+++ | fld qword [KBASE+RD*8]
+++ |2:
+++ | fld qword [BASE+RA*8]
+++ |4:
+++ | fcomparepp
+++ |.endif
++ goto iseqne_fp;
++ case BC_ISEQP: case BC_ISNEP:
++ vk = op == BC_ISEQP;
++@@ -3751,10 +4293,16 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |.else
++ | checknum RD, ->vmeta_unm
++ |.endif
+++ |.if SSE
++ | movsd xmm0, qword [BASE+RD*8]
++ | sseconst_sign xmm1, RDa
++ | xorps xmm0, xmm1
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fld qword [BASE+RD*8]
+++ | fchs
+++ | fstp qword [BASE+RA*8]
+++ |.endif
++ |.if DUALNUM
++ | jmp <9
++ |.else
++@@ -3770,11 +4318,15 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |1:
++ | mov dword [BASE+RA*8+4], LJ_TISNUM
++ | mov dword [BASE+RA*8], RD
++- |.else
+++ |.elif SSE
++ | xorps xmm0, xmm0
++ | cvtsi2sd xmm0, dword STR:RD->len
++ |1:
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fild dword STR:RD->len
+++ |1:
+++ | fstp qword [BASE+RA*8]
++ |.endif
++ | ins_next
++ |2:
++@@ -3792,8 +4344,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | // Length of table returned in eax (RD).
++ |.if DUALNUM
++ | // Nothing to do.
++- |.else
+++ |.elif SSE
++ | cvtsi2sd xmm0, RD
+++ |.else
+++ | mov ARG1, RD
+++ | fild ARG1
++ |.endif
++ | mov BASE, RB // Restore BASE.
++ | movzx RA, PC_RA
++@@ -3808,7 +4363,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++
++ /* -- Binary ops -------------------------------------------------------- */
++
++- |.macro ins_arithpre, sseins, ssereg
+++ |.macro ins_arithpre, x87ins, sseins, ssereg
++ | ins_ABC
++ ||vk = ((int)op - BC_ADDVN) / (BC_ADDNV-BC_ADDVN);
++ ||switch (vk) {
++@@ -3817,22 +4372,37 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | .if DUALNUM
++ | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_vn
++ | .endif
++- | movsd xmm0, qword [BASE+RB*8]
++- | sseins ssereg, qword [KBASE+RC*8]
+++ | .if SSE
+++ | movsd xmm0, qword [BASE+RB*8]
+++ | sseins ssereg, qword [KBASE+RC*8]
+++ | .else
+++ | fld qword [BASE+RB*8]
+++ | x87ins qword [KBASE+RC*8]
+++ | .endif
++ || break;
++ ||case 1:
++ | checknum RB, ->vmeta_arith_nv
++ | .if DUALNUM
++ | cmp dword [KBASE+RC*8+4], LJ_TISNUM; jae ->vmeta_arith_nv
++ | .endif
++- | movsd xmm0, qword [KBASE+RC*8]
++- | sseins ssereg, qword [BASE+RB*8]
+++ | .if SSE
+++ | movsd xmm0, qword [KBASE+RC*8]
+++ | sseins ssereg, qword [BASE+RB*8]
+++ | .else
+++ | fld qword [KBASE+RC*8]
+++ | x87ins qword [BASE+RB*8]
+++ | .endif
++ || break;
++ ||default:
++ | checknum RB, ->vmeta_arith_vv
++ | checknum RC, ->vmeta_arith_vv
++- | movsd xmm0, qword [BASE+RB*8]
++- | sseins ssereg, qword [BASE+RC*8]
+++ | .if SSE
+++ | movsd xmm0, qword [BASE+RB*8]
+++ | sseins ssereg, qword [BASE+RC*8]
+++ | .else
+++ | fld qword [BASE+RB*8]
+++ | x87ins qword [BASE+RC*8]
+++ | .endif
++ || break;
++ ||}
++ |.endmacro
++@@ -3870,62 +4440,55 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |.endmacro
++ |
++ |.macro ins_arithpost
+++ |.if SSE
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fstp qword [BASE+RA*8]
+++ |.endif
++ |.endmacro
++ |
++- |.macro ins_arith, sseins
++- | ins_arithpre sseins, xmm0
+++ |.macro ins_arith, x87ins, sseins
+++ | ins_arithpre x87ins, sseins, xmm0
++ | ins_arithpost
++ | ins_next
++ |.endmacro
++ |
++- |.macro ins_arith, intins, sseins
+++ |.macro ins_arith, intins, x87ins, sseins
++ |.if DUALNUM
++ | ins_arithdn intins
++ |.else
++- | ins_arith, sseins
+++ | ins_arith, x87ins, sseins
++ |.endif
++ |.endmacro
++
++ | // RA = dst, RB = src1 or num const, RC = src2 or num const
++ case BC_ADDVN: case BC_ADDNV: case BC_ADDVV:
++- | ins_arith add, addsd
+++ | ins_arith add, fadd, addsd
++ break;
++ case BC_SUBVN: case BC_SUBNV: case BC_SUBVV:
++- | ins_arith sub, subsd
+++ | ins_arith sub, fsub, subsd
++ break;
++ case BC_MULVN: case BC_MULNV: case BC_MULVV:
++- | ins_arith imul, mulsd
+++ | ins_arith imul, fmul, mulsd
++ break;
++ case BC_DIVVN: case BC_DIVNV: case BC_DIVVV:
++- | ins_arith divsd
+++ | ins_arith fdiv, divsd
++ break;
++ case BC_MODVN:
++- | ins_arithpre movsd, xmm1
+++ | ins_arithpre fld, movsd, xmm1
++ |->BC_MODVN_Z:
++ | call ->vm_mod
++ | ins_arithpost
++ | ins_next
++ break;
++ case BC_MODNV: case BC_MODVV:
++- | ins_arithpre movsd, xmm1
+++ | ins_arithpre fld, movsd, xmm1
++ | jmp ->BC_MODVN_Z // Avoid 3 copies. It's slow anyway.
++ break;
++ case BC_POW:
++- | ins_arithpre movsd, xmm1
++- | mov RB, BASE
++- |.if not X64
++- | movsd FPARG1, xmm0
++- | movsd FPARG3, xmm1
++- |.endif
++- | call extern pow
++- | movzx RA, PC_RA
++- | mov BASE, RB
++- |.if X64
+++ | ins_arithpre fld, movsd, xmm1 // FIXME: THIS SHOULD NOT BE FLD. Whole thing is broken
+++ | call ->vm_pow
++ | ins_arithpost
++- |.else
++- | fstp qword [BASE+RA*8]
++- |.endif
++ | ins_next
++ break;
++
++@@ -3993,17 +4556,25 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | movsx RD, RDW
++ | mov dword [BASE+RA*8+4], LJ_TISNUM
++ | mov dword [BASE+RA*8], RD
++- |.else
+++ |.elif SSE
++ | movsx RD, RDW // Sign-extend literal.
++ | cvtsi2sd xmm0, RD
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fild PC_RD // Refetch signed RD from instruction.
+++ | fstp qword [BASE+RA*8]
++ |.endif
++ | ins_next
++ break;
++ case BC_KNUM:
++ | ins_AD // RA = dst, RD = num const
+++ |.if SSE
++ | movsd xmm0, qword [KBASE+RD*8]
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fld qword [KBASE+RD*8]
+++ | fstp qword [BASE+RA*8]
+++ |.endif
++ | ins_next
++ break;
++ case BC_KPRI:
++@@ -4110,10 +4681,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ case BC_USETN:
++ | ins_AD // RA = upvalue #, RD = num const
++ | mov LFUNC:RB, [BASE-8]
+++ |.if SSE
++ | movsd xmm0, qword [KBASE+RD*8]
+++ |.else
+++ | fld qword [KBASE+RD*8]
+++ |.endif
++ | mov UPVAL:RB, [LFUNC:RB+RA*4+offsetof(GCfuncL, uvptr)]
++ | mov RA, UPVAL:RB->v
+++ |.if SSE
++ | movsd qword [RA], xmm0
+++ |.else
+++ | fstp qword [RA]
+++ |.endif
++ | ins_next
++ break;
++ case BC_USETP:
++@@ -4267,10 +4846,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |.else
++ | // Convert number to int and back and compare.
++ | checknum RC, >5
+++ |.if SSE
++ | movsd xmm0, qword [BASE+RC*8]
++ | cvttsd2si RC, xmm0
++ | cvtsi2sd xmm1, RC
++ | ucomisd xmm0, xmm1
+++ |.else
+++ | fld qword [BASE+RC*8]
+++ | fist ARG1
+++ | fild ARG1
+++ | fcomparepp
+++ | mov RC, ARG1
+++ |.endif
++ | jne ->vmeta_tgetv // Generic numeric key? Use fallback.
++ |.endif
++ | cmp RC, TAB:RB->asize // Takes care of unordered, too.
++@@ -4399,8 +4986,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | mov TAB:RB, [BASE+RB*8]
++ |.if DUALNUM
++ | mov RC, dword [BASE+RC*8]
++- |.else
+++ |.elif SSE
++ | cvttsd2si RC, qword [BASE+RC*8]
+++ |.else
+++ | fld qword [BASE+RC*8]
+++ | fistp TMP1
+++ | mov RC, TMP1
++ |.endif
++ | cmp RC, TAB:RB->asize
++ | jae ->vmeta_tgetr // Not in array part? Use fallback.
++@@ -4433,10 +5024,18 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |.else
++ | // Convert number to int and back and compare.
++ | checknum RC, >5
+++ |.if SSE
++ | movsd xmm0, qword [BASE+RC*8]
++ | cvttsd2si RC, xmm0
++ | cvtsi2sd xmm1, RC
++ | ucomisd xmm0, xmm1
+++ |.else
+++ | fld qword [BASE+RC*8]
+++ | fist ARG1
+++ | fild ARG1
+++ | fcomparepp
+++ | mov RC, ARG1
+++ |.endif
++ | jne ->vmeta_tsetv // Generic numeric key? Use fallback.
++ |.endif
++ | cmp RC, TAB:RB->asize // Takes care of unordered, too.
++@@ -4611,8 +5210,12 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | mov TAB:RB, [BASE+RB*8]
++ |.if DUALNUM
++ | mov RC, dword [BASE+RC*8]
++- |.else
+++ |.elif SSE
++ | cvttsd2si RC, qword [BASE+RC*8]
+++ |.else
+++ | fld qword [BASE+RC*8]
+++ | fistp TMP1
+++ | mov RC, TMP1
++ |.endif
++ | test byte TAB:RB->marked, LJ_GC_BLACK // isblack(table)
++ | jnz >7
++@@ -4833,8 +5436,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |.if DUALNUM
++ | mov dword [BASE+RA*8+4], LJ_TISNUM
++ | mov dword [BASE+RA*8], RC
++- |.else
+++ |.elif SSE
++ | cvtsi2sd xmm0, RC
+++ |.else
+++ | fild dword [BASE+RA*8-8]
++ |.endif
++ | // Copy array slot to returned value.
++ |.if X64
++@@ -4850,8 +5455,10 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | // Return array index as a numeric key.
++ |.if DUALNUM
++ | // See above.
++- |.else
+++ |.elif SSE
++ | movsd qword [BASE+RA*8], xmm0
+++ |.else
+++ | fstp qword [BASE+RA*8]
++ |.endif
++ | mov [BASE+RA*8-8], RC // Update control var.
++ |2:
++@@ -4864,6 +5471,9 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |
++ |4: // Skip holes in array part.
++ | add RC, 1
+++ |.if not (DUALNUM or SSE)
+++ | mov [BASE+RA*8-8], RC
+++ |.endif
++ | jmp <1
++ |
++ |5: // Traverse hash part.
++@@ -5211,6 +5821,7 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ if (!vk) {
++ | cmp RB, LJ_TISNUM; jae ->vmeta_for
++ }
+++ |.if SSE
++ | movsd xmm0, qword FOR_IDX
++ | movsd xmm1, qword FOR_STOP
++ if (vk) {
++@@ -5223,6 +5834,22 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ | ucomisd xmm1, xmm0
++ |1:
++ | movsd qword FOR_EXT, xmm0
+++ |.else
+++ | fld qword FOR_STOP
+++ | fld qword FOR_IDX
+++ if (vk) {
+++ | fadd qword FOR_STEP // nidx = idx + step
+++ | fst qword FOR_IDX
+++ | fst qword FOR_EXT
+++ | test RB, RB; js >1
+++ } else {
+++ | fst qword FOR_EXT
+++ | jl >1
+++ }
+++ | fxch // Swap lim/(n)idx if step non-negative.
+++ |1:
+++ | fcomparepp
+++ |.endif
++ if (op == BC_FORI) {
++ |.if DUALNUM
++ | jnb <7
++@@ -5250,10 +5877,11 @@ static void build_ins(BuildCtx *ctx, BCOp op, int defop)
++ |2:
++ | ins_next
++ |.endif
++- |
+++ |.if SSE
++ |3: // Invert comparison if step is negative.
++ | ucomisd xmm0, xmm1
++ | jmp <1
+++ |.endif
++ break;
++
++ case BC_ITERL:
+--
+2.25.1
+