author     Po-Chuan Hsieh <sunpoet@FreeBSD.org>    2022-07-11 13:51:34 +0000
committer  Po-Chuan Hsieh <sunpoet@FreeBSD.org>    2022-07-11 13:51:34 +0000
commit     a8ebe6902a15a11102372d0575c18cc9a01f19b1 (patch)
tree       c35b7b216ad081f6fd392af310ac0c1a955a6c5a
parent     19051b3d996c9ec8dc2d43d925bb54b9a2116a21 (diff)
download   ports-a8ebe6902a15a11102372d0575c18cc9a01f19b1.tar.gz
           ports-a8ebe6902a15a11102372d0575c18cc9a01f19b1.zip
security/nettle: Fix build of assembly code on aarch64
PR:             264946
Reported by:    diizzy
Tested by:      diizzy (RockPro64 with GnuTLS on 13.1-RELEASE)
Obtained from:  https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
-rw-r--r--  security/nettle/Makefile                                    |   4
-rw-r--r--  security/nettle/files/patch-arm64-chacha-4core.asm          | 146
-rw-r--r--  security/nettle/files/patch-fat-arm64.c                     |  27
-rw-r--r--  security/nettle/files/patch-powerpc64-p7-chacha-4core.asm   | 130
-rw-r--r--  security/nettle/files/patch-s390x-vf-chacha-4core.asm       | 130
5 files changed, 433 insertions(+), 4 deletions(-)
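
The portability issue the upstream change works around: POSIX m4 defines only the single-digit argument references $1 through $9, while the old QR macro took 16 arguments and relied on GNU m4 reading $10..$16 as multi-digit references. FreeBSD's base m4 follows the POSIX reading, which is presumably why the generated aarch64 assembly referenced bogus operands and the port had to disable assembler code. A minimal sketch of the ambiguity and of the ifelse indirection the patches below adopt (the macro names TENTH, MAP and SHOW are illustrative only, not from the patch):

    dnl Under POSIX m4 (e.g. FreeBSD's base m4), $10 is parsed as $1 followed
    dnl by a literal 0; GNU m4 reads it as the tenth argument.
    define(`TENTH', `$10')
    TENTH(a, b, c, d, e, f, g, h, i, j)
    dnl GNU m4 output: j          POSIX m4 output: a0

    dnl The workaround used below: pass a single mapping macro and index it
    dnl with $1(N), so only the portable $1 is ever referenced (cf. P1/P2).
    define(`MAP', `ifelse($1, 0, v16, $1, 1, v17, $1, 10, v26)')
    define(`SHOW', `first: $1(0)  eleventh: $1(10)')
    SHOW(`MAP')
    dnl expands to: first: v16  eleventh: v26
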
diff --git a/security/nettle/Makefile b/security/nettle/Makefile
index c3196b222ad4..2242322e6c36 100644
--- a/security/nettle/Makefile
+++ b/security/nettle/Makefile
@@ -36,10 +36,6 @@ EXAMPLES_USES= ssl
.include <bsd.port.options.mk>
-.if ${ARCH} == "aarch64"
-CONFIGURE_ARGS+=--disable-assembler
-.endif
-
.if ${ARCH} == "sparc64"
CONFIGURE_ENV+= CCPIC=-fPIC
.endif
diff --git a/security/nettle/files/patch-arm64-chacha-4core.asm b/security/nettle/files/patch-arm64-chacha-4core.asm
new file mode 100644
index 000000000000..2375fa618f1e
--- /dev/null
+++ b/security/nettle/files/patch-arm64-chacha-4core.asm
@@ -0,0 +1,146 @@
+Obtained from: https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- arm64/chacha-4core.asm.orig 2022-06-02 17:57:16 UTC
++++ arm64/chacha-4core.asm
+@@ -53,67 +53,74 @@ define(`TMP3', `v7')
+
+ define(`ROT24', `v8')
+
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, v16, $1, 1, v17, $1, 2, v18, $1, 3, v19, $1, 4, v20, $1, 5, v21, $1, 6, v22, $1, 7, v23, $1, 8, v24, $1, 9, v25, $1, 10, v26, $1, 11, v27, $1, 12, v28, $1, 13, v29, $1, 14, v30, $1, 15, v31)')
++define(`P2',
++`ifelse($1, 0, v16, $1, 1, v21, $1, 2, v26, $1, 3, v31, $1, 4, v20, $1, 5, v25, $1, 6, v30, $1, 7, v19, $1, 8, v24, $1, 9, v29, $1, 10, v18, $1, 11, v23, $1, 12, v28, $1, 13, v17, $1, 14, v22, $1, 15, v27)')
++
+ C Main loop for round
+ define(`QR',`
+- add $1.4s, $1.4s, $2.4s
+- add $5.4s, $5.4s, $6.4s
+- add $9.4s, $9.4s, $10.4s
+- add $13.4s, $13.4s, $14.4s
+- eor $4.16b, $4.16b, $1.16b
+- eor $8.16b, $8.16b, $5.16b
+- eor $12.16b, $12.16b, $9.16b
+- eor $16.16b, $16.16b, $13.16b
+- rev32 $4.8h, $4.8h
+- rev32 $8.8h, $8.8h
+- rev32 $12.8h, $12.8h
+- rev32 $16.8h, $16.8h
++ add $1(0).4s, $1(0).4s, $1(1).4s
++ add $1(4).4s, $1(4).4s, $1(5).4s
++ add $1(8).4s, $1(8).4s, $1(9).4s
++ add $1(12).4s, $1(12).4s, $1(13).4s
++ eor $1(3).16b, $1(3).16b, $1(0).16b
++ eor $1(7).16b, $1(7).16b, $1(4).16b
++ eor $1(11).16b, $1(11).16b, $1(8).16b
++ eor $1(15).16b, $1(15).16b, $1(12).16b
++ rev32 $1(3).8h, $1(3).8h
++ rev32 $1(7).8h, $1(7).8h
++ rev32 $1(11).8h, $1(11).8h
++ rev32 $1(15).8h, $1(15).8h
+
+- add $3.4s, $3.4s, $4.4s
+- add $7.4s, $7.4s, $8.4s
+- add $11.4s, $11.4s, $12.4s
+- add $15.4s, $15.4s, $16.4s
+- eor TMP0.16b, $2.16b, $3.16b
+- eor TMP1.16b, $6.16b, $7.16b
+- eor TMP2.16b, $10.16b, $11.16b
+- eor TMP3.16b, $14.16b, $15.16b
+- ushr $2.4s, TMP0.4s, #20
+- ushr $6.4s, TMP1.4s, #20
+- ushr $10.4s, TMP2.4s, #20
+- ushr $14.4s, TMP3.4s, #20
+- sli $2.4s, TMP0.4s, #12
+- sli $6.4s, TMP1.4s, #12
+- sli $10.4s, TMP2.4s, #12
+- sli $14.4s, TMP3.4s, #12
++ add $1(2).4s, $1(2).4s, $1(3).4s
++ add $1(6).4s, $1(6).4s, $1(7).4s
++ add $1(10).4s, $1(10).4s, $1(11).4s
++ add $1(14).4s, $1(14).4s, $1(15).4s
++ eor TMP0.16b, $1(1).16b, $1(2).16b
++ eor TMP1.16b, $1(5).16b, $1(6).16b
++ eor TMP2.16b, $1(9).16b, $1(10).16b
++ eor TMP3.16b, $1(13).16b, $1(14).16b
++ ushr $1(1).4s, TMP0.4s, #20
++ ushr $1(5).4s, TMP1.4s, #20
++ ushr $1(9).4s, TMP2.4s, #20
++ ushr $1(13).4s, TMP3.4s, #20
++ sli $1(1).4s, TMP0.4s, #12
++ sli $1(5).4s, TMP1.4s, #12
++ sli $1(9).4s, TMP2.4s, #12
++ sli $1(13).4s, TMP3.4s, #12
+
+- add $1.4s, $1.4s, $2.4s
+- add $5.4s, $5.4s, $6.4s
+- add $9.4s, $9.4s, $10.4s
+- add $13.4s, $13.4s, $14.4s
+- eor $4.16b, $4.16b, $1.16b
+- eor $8.16b, $8.16b, $5.16b
+- eor $12.16b, $12.16b, $9.16b
+- eor $16.16b, $16.16b, $13.16b
+- tbl $4.16b, {$4.16b}, ROT24.16b
+- tbl $8.16b, {$8.16b}, ROT24.16b
+- tbl $12.16b, {$12.16b}, ROT24.16b
+- tbl $16.16b, {$16.16b}, ROT24.16b
++ add $1(0).4s, $1(0).4s, $1(1).4s
++ add $1(4).4s, $1(4).4s, $1(5).4s
++ add $1(8).4s, $1(8).4s, $1(9).4s
++ add $1(12).4s, $1(12).4s, $1(13).4s
++ eor $1(3).16b, $1(3).16b, $1(0).16b
++ eor $1(7).16b, $1(7).16b, $1(4).16b
++ eor $1(11).16b, $1(11).16b, $1(8).16b
++ eor $1(15).16b, $1(15).16b, $1(12).16b
++ tbl $1(3).16b, {$1(3).16b}, ROT24.16b
++ tbl $1(7).16b, {$1(7).16b}, ROT24.16b
++ tbl $1(11).16b, {$1(11).16b}, ROT24.16b
++ tbl $1(15).16b, {$1(15).16b}, ROT24.16b
+
+- add $3.4s, $3.4s, $4.4s
+- add $7.4s, $7.4s, $8.4s
+- add $11.4s, $11.4s, $12.4s
+- add $15.4s, $15.4s, $16.4s
+- eor TMP0.16b, $2.16b, $3.16b
+- eor TMP1.16b, $6.16b, $7.16b
+- eor TMP2.16b, $10.16b, $11.16b
+- eor TMP3.16b, $14.16b, $15.16b
+- ushr $2.4s, TMP0.4s, #25
+- ushr $6.4s, TMP1.4s, #25
+- ushr $10.4s, TMP2.4s, #25
+- ushr $14.4s, TMP3.4s, #25
+- sli $2.4s, TMP0.4s, #7
+- sli $6.4s, TMP1.4s, #7
+- sli $10.4s, TMP2.4s, #7
+- sli $14.4s, TMP3.4s, #7
++ add $1(2).4s, $1(2).4s, $1(3).4s
++ add $1(6).4s, $1(6).4s, $1(7).4s
++ add $1(10).4s, $1(10).4s, $1(11).4s
++ add $1(14).4s, $1(14).4s, $1(15).4s
++ eor TMP0.16b, $1(1).16b, $1(2).16b
++ eor TMP1.16b, $1(5).16b, $1(6).16b
++ eor TMP2.16b, $1(9).16b, $1(10).16b
++ eor TMP3.16b, $1(13).16b, $1(14).16b
++ ushr $1(1).4s, TMP0.4s, #25
++ ushr $1(5).4s, TMP1.4s, #25
++ ushr $1(9).4s, TMP2.4s, #25
++ ushr $1(13).4s, TMP3.4s, #25
++ sli $1(1).4s, TMP0.4s, #7
++ sli $1(5).4s, TMP1.4s, #7
++ sli $1(9).4s, TMP2.4s, #7
++ sli $1(13).4s, TMP3.4s, #7
+ ')
+
+ define(`TRANSPOSE',`
+@@ -174,8 +181,8 @@ C Load state and splat
+ mov T3.16b, v31.16b
+
+ .Loop:
+- QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+- QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27)
++ QR(`P1')
++ QR(`P2')
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
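
As a reading aid (not part of the patch), tracing the first instruction of the new QR(`P1') call shows that the indirection emits exactly the code the old 16-argument call produced:

    QR(`P1') sets $1 to P1 inside the macro body, so
        add $1(0).4s, $1(0).4s, $1(1).4s
    is rescanned as
        add P1(0).4s, P1(0).4s, P1(1).4s
    and, once ifelse resolves P1(0) to v16 and P1(1) to v17, becomes
        add v16.4s, v16.4s, v17.4s
    which matches the first line of the original QR(v16, v17, ..., v31).
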
diff --git a/security/nettle/files/patch-fat-arm64.c b/security/nettle/files/patch-fat-arm64.c
new file mode 100644
index 000000000000..b1e9cefc943f
--- /dev/null
+++ b/security/nettle/files/patch-fat-arm64.c
@@ -0,0 +1,27 @@
+--- fat-arm64.c.orig 2022-06-02 17:57:15 UTC
++++ fat-arm64.c
+@@ -46,6 +46,9 @@
+ # include <asm/hwcap.h>
+ # include <sys/auxv.h>
+ # endif
++#elif defined(__FreeBSD__)
++# define USE_GETAUXVAL 1
++# include <sys/auxv.h>
+ #endif
+
+ #include "nettle-types.h"
+@@ -113,7 +116,14 @@ get_arm64_features (struct arm64_features *features)
+ else
+ {
+ #if USE_GETAUXVAL
++#if defined (__FreeBSD__)
++ unsigned long hwcap;
++ if(elf_aux_info(AT_HWCAP, &hwcap, sizeof(unsigned long)) != 0) {
++ hwcap = 0;
++ }
++#else
+ unsigned long hwcap = getauxval(AT_HWCAP);
++#endif
+ features->have_aes
+ = ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
+ features->have_pmull
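
For context, a standalone sketch of the FreeBSD-side runtime detection the hunk above switches to. It assumes only that elf_aux_info(3) and AT_HWCAP come from <sys/auxv.h>; the aarch64 HWCAP_* bits, which the patch relies on being visible through that header, are guarded so the sketch still compiles elsewhere:

    #include <sys/auxv.h>
    #include <stdio.h>

    int
    main(void)
    {
            unsigned long hwcap = 0;

            /* elf_aux_info(3) returns 0 on success, an errno value on failure. */
            if (elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)) != 0)
                    hwcap = 0;      /* no hwcaps -> fall back to plain C paths */

    #if defined(HWCAP_ASIMD) && defined(HWCAP_AES)
            printf("aes: %s\n",
                (hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES) ?
                "yes" : "no");
    #else
            printf("raw AT_HWCAP: %#lx\n", hwcap);
    #endif
            return (0);
    }
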
diff --git a/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm b/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm
new file mode 100644
index 000000000000..bb1032117cdb
--- /dev/null
+++ b/security/nettle/files/patch-powerpc64-p7-chacha-4core.asm
@@ -0,0 +1,130 @@
+Obtained from: https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- powerpc64/p7/chacha-4core.asm.orig 2022-06-02 17:57:16 UTC
++++ powerpc64/p7/chacha-4core.asm
+@@ -53,59 +53,66 @@ define(`T1', `v21')
+ define(`T2', `v22')
+ define(`T3', `v23')
+
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, v0, $1, 1, v4, $1, 2, v8, $1, 3, v12, $1, 4, v1, $1, 5, v5, $1, 6, v9, $1, 7, v13, $1, 8, v2, $1, 9, v6, $1, 10, v10, $1, 11, v14, $1, 12, v3, $1, 13, v7, $1, 14, v11, $1, 15, v15)')
++define(`P2',
++`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v1, $1, 5, v6, $1, 6, v11, $1, 7, v12, $1, 8, v2, $1, 9, v7, $1, 10, v8, $1, 11, v13, $1, 12, v3, $1, 13, v4, $1, 14, v9, $1, 15, v14)')
++
+ C Main loop for round
+ define(`QR',`
+- vadduwm $1, $1, $2
+- vadduwm $5, $5, $6
+- vadduwm $9, $9, $10
+- vadduwm $13, $13, $14
+- vxor $4, $4, $1
+- vxor $8, $8, $5
+- vxor $12, $12, $9
+- vxor $16, $16, $13
+- vrlw $4, $4, ROT16
+- vrlw $8, $8, ROT16
+- vrlw $12, $12, ROT16
+- vrlw $16, $16, ROT16
++ vadduwm $1(0), $1(0), $1(1)
++ vadduwm $1(4), $1(4), $1(5)
++ vadduwm $1(8), $1(8), $1(9)
++ vadduwm $1(12), $1(12), $1(13)
++ vxor $1(3), $1(3), $1(0)
++ vxor $1(7), $1(7), $1(4)
++ vxor $1(11), $1(11), $1(8)
++ vxor $1(15), $1(15), $1(12)
++ vrlw $1(3), $1(3), ROT16
++ vrlw $1(7), $1(7), ROT16
++ vrlw $1(11), $1(11), ROT16
++ vrlw $1(15), $1(15), ROT16
+
+- vadduwm $3, $3, $4
+- vadduwm $7, $7, $8
+- vadduwm $11, $11, $12
+- vadduwm $15, $15, $16
+- vxor $2, $2, $3
+- vxor $6, $6, $7
+- vxor $10, $10, $11
+- vxor $14, $14, $15
+- vrlw $2, $2, ROT12
+- vrlw $6, $6, ROT12
+- vrlw $10, $10, ROT12
+- vrlw $14, $14, ROT12
++ vadduwm $1(2), $1(2), $1(3)
++ vadduwm $1(6), $1(6), $1(7)
++ vadduwm $1(10), $1(10), $1(11)
++ vadduwm $1(14), $1(14), $1(15)
++ vxor $1(1), $1(1), $1(2)
++ vxor $1(5), $1(5), $1(6)
++ vxor $1(9), $1(9), $1(10)
++ vxor $1(13), $1(13), $1(14)
++ vrlw $1(1), $1(1), ROT12
++ vrlw $1(5), $1(5), ROT12
++ vrlw $1(9), $1(9), ROT12
++ vrlw $1(13), $1(13), ROT12
+
+- vadduwm $1, $1, $2
+- vadduwm $5, $5, $6
+- vadduwm $9, $9, $10
+- vadduwm $13, $13, $14
+- vxor $4, $4, $1
+- vxor $8, $8, $5
+- vxor $12, $12, $9
+- vxor $16, $16, $13
+- vrlw $4, $4, ROT8
+- vrlw $8, $8, ROT8
+- vrlw $12, $12, ROT8
+- vrlw $16, $16, ROT8
++ vadduwm $1(0), $1(0), $1(1)
++ vadduwm $1(4), $1(4), $1(5)
++ vadduwm $1(8), $1(8), $1(9)
++ vadduwm $1(12), $1(12), $1(13)
++ vxor $1(3), $1(3), $1(0)
++ vxor $1(7), $1(7), $1(4)
++ vxor $1(11), $1(11), $1(8)
++ vxor $1(15), $1(15), $1(12)
++ vrlw $1(3), $1(3), ROT8
++ vrlw $1(7), $1(7), ROT8
++ vrlw $1(11), $1(11), ROT8
++ vrlw $1(15), $1(15), ROT8
+
+- vadduwm $3, $3, $4
+- vadduwm $7, $7, $8
+- vadduwm $11, $11, $12
+- vadduwm $15, $15, $16
+- vxor $2, $2, $3
+- vxor $6, $6, $7
+- vxor $10, $10, $11
+- vxor $14, $14, $15
+- vrlw $2, $2, ROT7
+- vrlw $6, $6, ROT7
+- vrlw $10, $10, ROT7
+- vrlw $14, $14, ROT7
++ vadduwm $1(2), $1(2), $1(3)
++ vadduwm $1(6), $1(6), $1(7)
++ vadduwm $1(10), $1(10), $1(11)
++ vadduwm $1(14), $1(14), $1(15)
++ vxor $1(1), $1(1), $1(2)
++ vxor $1(5), $1(5), $1(6)
++ vxor $1(9), $1(9), $1(10)
++ vxor $1(13), $1(13), $1(14)
++ vrlw $1(1), $1(1), ROT7
++ vrlw $1(5), $1(5), ROT7
++ vrlw $1(9), $1(9), ROT7
++ vrlw $1(13), $1(13), ROT7
+ ')
+
+ define(`TRANSPOSE',`
+@@ -185,8 +192,8 @@ C Load state and splat
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+ .Loop:
+- QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
+- QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
++ QR(`P1')
++ QR(`P2')
+ bdnz .Loop
+
+ C Add in saved original words, including counters, before
diff --git a/security/nettle/files/patch-s390x-vf-chacha-4core.asm b/security/nettle/files/patch-s390x-vf-chacha-4core.asm
new file mode 100644
index 000000000000..23cb5766a37b
--- /dev/null
+++ b/security/nettle/files/patch-s390x-vf-chacha-4core.asm
@@ -0,0 +1,130 @@
+Obtained from: https://git.lysator.liu.se/nettle/nettle/-/commit/d4c7597e4236f746434c9a1a24f6191f7ff870cd
+
+--- s390x/vf/chacha-4core.asm.orig 2022-06-02 17:57:16 UTC
++++ s390x/vf/chacha-4core.asm
+@@ -48,59 +48,66 @@ define(`T3', `%v28')
+ define(`T2', `%v27')
+ define(`T3', `%v28')
+
++C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
++C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
++define(`P1',
++`ifelse($1, 0, v0, $1, 1, v1, $1, 2, v2, $1, 3, v3, $1, 4, v4, $1, 5, v5, $1, 6, v6, $1, 7, v7, $1, 8, v8, $1, 9, v9, $1, 10, v10, $1, 11, v11, $1, 12, v12, $1, 13, v13, $1, 14, v14, $1, 15, v15)')
++define(`P2',
++`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v4, $1, 5, v9, $1, 6, v14, $1, 7, v3, $1, 8, v8, $1, 9, v13, $1, 10, v2, $1, 11, v7, $1, 12, v12, $1, 13, v1, $1, 14, v6, $1, 15, v11)')
++
+ C Main loop for round
+ define(`QR',`
+- vaf $1, $1, $2
+- vaf $5, $5, $6
+- vaf $9, $9, $10
+- vaf $13, $13, $14
+- vx $4, $4, $1
+- vx $8, $8, $5
+- vx $12, $12, $9
+- vx $16, $16, $13
+- verllf $4, $4, 16
+- verllf $8, $8, 16
+- verllf $12, $12, 16
+- verllf $16, $16, 16
++ vaf $1(0), $1(0), $1(1)
++ vaf $1(4), $1(4), $1(5)
++ vaf $1(8), $1(8), $1(9)
++ vaf $1(12), $1(12), $1(13)
++ vx $1(3), $1(3), $1(0)
++ vx $1(7), $1(7), $1(4)
++ vx $1(11), $1(11), $1(8)
++ vx $1(15), $1(15), $1(12)
++ verllf $1(3), $1(3), 16
++ verllf $1(7), $1(7), 16
++ verllf $1(11), $1(11), 16
++ verllf $1(15), $1(15), 16
+
+- vaf $3, $3, $4
+- vaf $7, $7, $8
+- vaf $11, $11, $12
+- vaf $15, $15, $16
+- vx $2, $2, $3
+- vx $6, $6, $7
+- vx $10, $10, $11
+- vx $14, $14, $15
+- verllf $2, $2, 12
+- verllf $6, $6, 12
+- verllf $10, $10, 12
+- verllf $14, $14, 12
++ vaf $1(2), $1(2), $1(3)
++ vaf $1(6), $1(6), $1(7)
++ vaf $1(10), $1(10), $1(11)
++ vaf $1(14), $1(14), $1(15)
++ vx $1(1), $1(1), $1(2)
++ vx $1(5), $1(5), $1(6)
++ vx $1(9), $1(9), $1(10)
++ vx $1(13), $1(13), $1(14)
++ verllf $1(1), $1(1), 12
++ verllf $1(5), $1(5), 12
++ verllf $1(9), $1(9), 12
++ verllf $1(13), $1(13), 12
+
+- vaf $1, $1, $2
+- vaf $5, $5, $6
+- vaf $9, $9, $10
+- vaf $13, $13, $14
+- vx $4, $4, $1
+- vx $8, $8, $5
+- vx $12, $12, $9
+- vx $16, $16, $13
+- verllf $4, $4, 8
+- verllf $8, $8, 8
+- verllf $12, $12, 8
+- verllf $16, $16, 8
++ vaf $1(0), $1(0), $1(1)
++ vaf $1(4), $1(4), $1(5)
++ vaf $1(8), $1(8), $1(9)
++ vaf $1(12), $1(12), $1(13)
++ vx $1(3), $1(3), $1(0)
++ vx $1(7), $1(7), $1(4)
++ vx $1(11), $1(11), $1(8)
++ vx $1(15), $1(15), $1(12)
++ verllf $1(3), $1(3), 8
++ verllf $1(7), $1(7), 8
++ verllf $1(11), $1(11), 8
++ verllf $1(15), $1(15), 8
+
+- vaf $3, $3, $4
+- vaf $7, $7, $8
+- vaf $11, $11, $12
+- vaf $15, $15, $16
+- vx $2, $2, $3
+- vx $6, $6, $7
+- vx $10, $10, $11
+- vx $14, $14, $15
+- verllf $2, $2, 7
+- verllf $6, $6, 7
+- verllf $10, $10, 7
+- verllf $14, $14, 7
++ vaf $1(2), $1(2), $1(3)
++ vaf $1(6), $1(6), $1(7)
++ vaf $1(10), $1(10), $1(11)
++ vaf $1(14), $1(14), $1(15)
++ vx $1(1), $1(1), $1(2)
++ vx $1(5), $1(5), $1(6)
++ vx $1(9), $1(9), $1(10)
++ vx $1(13), $1(13), $1(14)
++ verllf $1(1), $1(1), 7
++ verllf $1(5), $1(5), 7
++ verllf $1(9), $1(9), 7
++ verllf $1(13), $1(13), 7
+ ')
+
+ define(`TRANSPOSE',`
+@@ -176,8 +183,8 @@ C Load state and splat
+ srlg ROUNDS, ROUNDS, 1
+
+ .Loop:
+- QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15)
+- QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11)
++ QR(`P1')
++ QR(`P2')
+ brctg ROUNDS, .Loop
+
+ C Add in saved original words, including counters, before