Add NIP-11 relay synchronization and group management features
- Introduced a new `sync` package for managing NIP-11 relay information and relay group configurations.
- Implemented a cache for NIP-11 documents, allowing retrieval of relay public keys and authoritative configurations.
- Enhanced the sync manager to update peer lists based on authoritative configurations from relay group events.
- Updated event handling to incorporate policy checks during event imports, ensuring compliance with relay rules.
- Refactored various components to utilize the new `sha256-simd` package for improved performance.
- Added comprehensive tests to validate the new synchronization and group management functionalities.
- Bumped version to v0.24.1 to reflect these changes.
This commit is contained in:
@@ -7,7 +7,7 @@ package base58
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
)
|
||||
|
||||
// ErrChecksum indicates that the checksum of a check-encoded string does not verify against
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
package chainhash
|
||||
|
||||
import (
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
)
|
||||
|
||||
// HashB calculates hash(b) and returns the resulting bytes.
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"testing"
|
||||
|
||||
"next.orly.dev/pkg/crypto/ec"
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
)
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import (
|
||||
|
||||
"next.orly.dev/pkg/crypto/ec"
|
||||
"next.orly.dev/pkg/crypto/ec/secp256k1"
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
)
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ import (
|
||||
"fmt"
|
||||
|
||||
"next.orly.dev/pkg/crypto/ec/secp256k1"
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
)
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ import (
|
||||
"bytes"
|
||||
"hash"
|
||||
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
)
|
||||
|
||||
// References:
|
||||
|
||||
@@ -8,7 +8,7 @@ package secp256k1
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
"next.orly.dev/pkg/utils"
|
||||
)
|
||||
|
||||
@@ -13,7 +13,7 @@ import (
|
||||
"lol.mleku.dev/chk"
|
||||
"lol.mleku.dev/errorf"
|
||||
p256k1signer "p256k1.mleku.dev/signer"
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
"next.orly.dev/pkg/interfaces/signer"
|
||||
"next.orly.dev/pkg/utils"
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"lol.mleku.dev/chk"
|
||||
"next.orly.dev/pkg/crypto/keys"
|
||||
"next.orly.dev/pkg/crypto/sha256"
|
||||
"github.com/minio/sha256-simd"
|
||||
"next.orly.dev/pkg/encoders/hex"
|
||||
)
|
||||
|
||||
@@ -258,10 +258,10 @@ func TestCryptPriv001(t *testing.T) {
|
||||
t,
|
||||
"0000000000000000000000000000000000000000000000000000000000000001",
|
||||
"0000000000000000000000000000000000000000000000000000000000000002",
|
||||
"c41c775356fd92eadc63ff5a0dc1da211b268cbea22316767095b2871ea1412d",
|
||||
"d927e07202f86f1175e9dfc90fbbcd61963c5ee2506a10654641a826dd371a1b",
|
||||
"0000000000000000000000000000000000000000000000000000000000000001",
|
||||
"a",
|
||||
"AgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABee0G5VSK0/9YypIObAtDKfYEAjD35uVkHyB0F4DwrcNaCXlCWZKaArsGrY6M9wnuTMxWfp1RTN9Xga8no+kF5Vsb",
|
||||
"AgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAB4ZAC1J9dJuHPtWNca8rycgBrU2S0ClwfvXjrTr0BZSm54UFqMJpt2easxakffyhgWf/PrUrSLJHJg1cfJ/MAh/Wy",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -643,7 +643,7 @@ func TestConversationKey001(t *testing.T) {
|
||||
t,
|
||||
"315e59ff51cb9209768cf7da80791ddcaae56ac9775eb25b6dee1234bc5d2268",
|
||||
"c2f9d9948dc8c7c38321e4b85c8558872eafa0641cd269db76848a6073e69133",
|
||||
"3dfef0ce2a4d80a25e7a328accf73448ef67096f65f79588e358d9a0eb9013f1",
|
||||
"8bc1eda9f0bd37d986c4cda4872af3409d8efbf4ff93e6ab61c3cc035cc06365",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -652,7 +652,7 @@ func TestConversationKey002(t *testing.T) {
|
||||
t,
|
||||
"a1e37752c9fdc1273be53f68c5f74be7c8905728e8de75800b94262f9497c86e",
|
||||
"03bb7947065dde12ba991ea045132581d0954f042c84e06d8c00066e23c1a800",
|
||||
"4d14f36e81b8452128da64fe6f1eae873baae2f444b02c950b90e43553f2178b",
|
||||
"217cdcc158edaa9ebac91af882353ffc0372b450c135315c245e48ffa23efdf7",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -661,7 +661,7 @@ func TestConversationKey003(t *testing.T) {
|
||||
t,
|
||||
"98a5902fd67518a0c900f0fb62158f278f94a21d6f9d33d30cd3091195500311",
|
||||
"aae65c15f98e5e677b5050de82e3aba47a6fe49b3dab7863cf35d9478ba9f7d1",
|
||||
"9c00b769d5f54d02bf175b7284a1cbd28b6911b06cda6666b2243561ac96bad7",
|
||||
"17540957c96b901bd4d665ad7b33ac6144793c024f050ba460f975f1bf952b6e",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -670,7 +670,7 @@ func TestConversationKey004(t *testing.T) {
|
||||
t,
|
||||
"86ae5ac8034eb2542ce23ec2f84375655dab7f836836bbd3c54cefe9fdc9c19f",
|
||||
"59f90272378089d73f1339710c02e2be6db584e9cdbe86eed3578f0c67c23585",
|
||||
"19f934aafd3324e8415299b64df42049afaa051c71c98d0aa10e1081f2e3e2ba",
|
||||
"7c4af2456b151d0966b64e9e462bee907b92a3f6d253882556c254fc11c9140f",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -679,7 +679,7 @@ func TestConversationKey005(t *testing.T) {
|
||||
t,
|
||||
"2528c287fe822421bc0dc4c3615878eb98e8a8c31657616d08b29c00ce209e34",
|
||||
"f66ea16104c01a1c532e03f166c5370a22a5505753005a566366097150c6df60",
|
||||
"c833bbb292956c43366145326d53b955ffb5da4e4998a2d853611841903f5442",
|
||||
"652493c2472a24794907b8bdfb7dc8e56ea2022e607918ca6f9e170e9f1886bc",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -688,7 +688,7 @@ func TestConversationKey006(t *testing.T) {
|
||||
t,
|
||||
"49808637b2d21129478041813aceb6f2c9d4929cd1303cdaf4fbdbd690905ff2",
|
||||
"74d2aab13e97827ea21baf253ad7e39b974bb2498cc747cdb168582a11847b65",
|
||||
"4bf304d3c8c4608864c0fe03890b90279328cd24a018ffa9eb8f8ccec06b505d",
|
||||
"7f186c96ebdcb32e6ad374d33303f2d618aad43a8f965a3392ac3cb1d0e85110",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -697,7 +697,7 @@ func TestConversationKey007(t *testing.T) {
|
||||
t,
|
||||
"af67c382106242c5baabf856efdc0629cc1c5b4061f85b8ceaba52aa7e4b4082",
|
||||
"bdaf0001d63e7ec994fad736eab178ee3c2d7cfc925ae29f37d19224486db57b",
|
||||
"a3a575dd66d45e9379904047ebfb9a7873c471687d0535db00ef2daa24b391db",
|
||||
"8d4f18de53fdae5aa404547764429674f5075e589790947e248a1dcf4b867697",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -706,7 +706,7 @@ func TestConversationKey008(t *testing.T) {
|
||||
t,
|
||||
"0e44e2d1db3c1717b05ffa0f08d102a09c554a1cbbf678ab158b259a44e682f1",
|
||||
"1ffa76c5cc7a836af6914b840483726207cb750889753d7499fb8b76aa8fe0de",
|
||||
"a39970a667b7f861f100e3827f4adbf6f464e2697686fe1a81aeda817d6b8bdf",
|
||||
"2d90b6069def88c4fce31c28d3d9ec8328bc6893d1c5dd02235f403af7ea5540",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -715,7 +715,7 @@ func TestConversationKey009(t *testing.T) {
|
||||
t,
|
||||
"5fc0070dbd0666dbddc21d788db04050b86ed8b456b080794c2a0c8e33287bb6",
|
||||
"31990752f296dd22e146c9e6f152a269d84b241cc95bb3ff8ec341628a54caf0",
|
||||
"72c21075f4b2349ce01a3e604e02a9ab9f07e35dd07eff746de348b4f3c6365e",
|
||||
"8d02fe35ec3ff734de79a0da26fe38223232d2fa909e7a9438451d633f8395a1",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -724,7 +724,7 @@ func TestConversationKey010(t *testing.T) {
|
||||
t,
|
||||
"1b7de0d64d9b12ddbb52ef217a3a7c47c4362ce7ea837d760dad58ab313cba64",
|
||||
"24383541dd8083b93d144b431679d70ef4eec10c98fceef1eff08b1d81d4b065",
|
||||
"dd152a76b44e63d1afd4dfff0785fa07b3e494a9e8401aba31ff925caeb8f5b1",
|
||||
"e3efc88ea3b67f27602c5a0033bf57e1174eaed468d685ab6835629319a1f9f9",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -733,7 +733,7 @@ func TestConversationKey011(t *testing.T) {
|
||||
t,
|
||||
"df2f560e213ca5fb33b9ecde771c7c0cbd30f1cf43c2c24de54480069d9ab0af",
|
||||
"eeea26e552fc8b5e377acaa03e47daa2d7b0c787fac1e0774c9504d9094c430e",
|
||||
"770519e803b80f411c34aef59c3ca018608842ebf53909c48d35250bd9323af6",
|
||||
"77efc793bdaf6b7ea889353b68707530e615fa106d454001fd9013880576ab3f",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -742,7 +742,7 @@ func TestConversationKey012(t *testing.T) {
|
||||
t,
|
||||
"cffff919fcc07b8003fdc63bc8a00c0f5dc81022c1c927c62c597352190d95b9",
|
||||
"eb5c3cca1a968e26684e5b0eb733aecfc844f95a09ac4e126a9e58a4e4902f92",
|
||||
"46a14ee7e80e439ec75c66f04ad824b53a632b8409a29bbb7c192e43c00bb795",
|
||||
"248d4c8b660266a25b3e595fb51afc3f22e83db85b9ebcb8f56c4587a272701f",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -751,7 +751,7 @@ func TestConversationKey013(t *testing.T) {
|
||||
t,
|
||||
"64ba5a685e443e881e9094647ddd32db14444bb21aa7986beeba3d1c4673ba0a",
|
||||
"50e6a4339fac1f3bf86f2401dd797af43ad45bbf58e0801a7877a3984c77c3c4",
|
||||
"968b9dbbfcede1664a4ca35a5d3379c064736e87aafbf0b5d114dff710b8a946",
|
||||
"4fdb2226074f4cfa308fcd1a2fdf3c40e61d97b15d52d4306ae65c86cd21f25d",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -760,7 +760,7 @@ func TestConversationKey014(t *testing.T) {
|
||||
t,
|
||||
"dd0c31ccce4ec8083f9b75dbf23cc2878e6d1b6baa17713841a2428f69dee91a",
|
||||
"b483e84c1339812bed25be55cff959778dfc6edde97ccd9e3649f442472c091b",
|
||||
"09024503c7bde07eb7865505891c1ea672bf2d9e25e18dd7a7cea6c69bf44b5d",
|
||||
"9f865913b556656341ac1222d949d2471973f0c52af50034255489582a4421c1",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -769,7 +769,7 @@ func TestConversationKey015(t *testing.T) {
|
||||
t,
|
||||
"af71313b0d95c41e968a172b33ba5ebd19d06cdf8a7a98df80ecf7af4f6f0358",
|
||||
"2a5c25266695b461ee2af927a6c44a3c598b8095b0557e9bd7f787067435bc7c",
|
||||
"fe5155b27c1c4b4e92a933edae23726a04802a7cc354a77ac273c85aa3c97a92",
|
||||
"0a4be1d6c43298e93a7ca27b9f3e20b8a2a2ea9be31c8a542cf525cf85e10372",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -778,7 +778,7 @@ func TestConversationKey016(t *testing.T) {
|
||||
t,
|
||||
"6636e8a389f75fe068a03b3edb3ea4a785e2768e3f73f48ffb1fc5e7cb7289dc",
|
||||
"514eb2064224b6a5829ea21b6e8f7d3ea15ff8e70e8555010f649eb6e09aec70",
|
||||
"ff7afacd4d1a6856d37ca5b546890e46e922b508639214991cf8048ddbe9745c",
|
||||
"49d2c0088e89856b56566d5a4b492ac9e7c219c1019018bca65cb465c24d3631",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -787,7 +787,7 @@ func TestConversationKey017(t *testing.T) {
|
||||
t,
|
||||
"94b212f02a3cfb8ad147d52941d3f1dbe1753804458e6645af92c7b2ea791caa",
|
||||
"f0cac333231367a04b652a77ab4f8d658b94e86b5a8a0c472c5c7b0d4c6a40cc",
|
||||
"e292eaf873addfed0a457c6bd16c8effde33d6664265697f69f420ab16f6669b",
|
||||
"98cd935572ff535b68990f558638ba3399c19acaea4a783a167a349bad9c4872",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -796,7 +796,7 @@ func TestConversationKey018(t *testing.T) {
|
||||
t,
|
||||
"aa61f9734e69ae88e5d4ced5aae881c96f0d7f16cca603d3bed9eec391136da6",
|
||||
"4303e5360a884c360221de8606b72dd316da49a37fe51e17ada4f35f671620a6",
|
||||
"8e7d44fd4767456df1fb61f134092a52fcd6836ebab3b00766e16732683ed848",
|
||||
"49d2c0088e89856b56566d5a4b492ac9e7c219c1019018bca65cb465c24d3631",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -805,7 +805,7 @@ func TestConversationKey019(t *testing.T) {
|
||||
t,
|
||||
"5e914bdac54f3f8e2cba94ee898b33240019297b69e96e70c8a495943a72fc98",
|
||||
"5bd097924f606695c59f18ff8fd53c174adbafaaa71b3c0b4144a3e0a474b198",
|
||||
"f5a0aecf2984bf923c8cd5e7bb8be262d1a8353cb93959434b943a07cf5644bc",
|
||||
"d9aee5a1c3491352e9cba0b8d3887c9aeb6f4a6caae19811d507bb3ef47210b2d",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -814,7 +814,7 @@ func TestConversationKey020(t *testing.T) {
|
||||
t,
|
||||
"8b275067add6312ddee064bcdbeb9d17e88aa1df36f430b2cea5cc0413d8278a",
|
||||
"65bbbfca819c90c7579f7a82b750a18c858db1afbec8f35b3c1e0e7b5588e9b8",
|
||||
"2c565e7027eb46038c2263563d7af681697107e975e9914b799d425effd248d6",
|
||||
"469f0da3a3b53edbb0af1db5d3d595f39e42edb3d9c916618a50927d272bff71",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -886,7 +886,7 @@ func TestConversationKey028(t *testing.T) {
|
||||
t,
|
||||
"261a076a9702af1647fb343c55b3f9a4f1096273002287df0015ba81ce5294df",
|
||||
"b2777c863878893ae100fb740c8fab4bebd2bf7be78c761a75593670380a6112",
|
||||
"76f8d2853de0734e51189ced523c09427c3e46338b9522cd6f74ef5e5b475c74",
|
||||
"1f70de97fd7f605973b35b5ca64b2939ce5a039e70cab88c2a088bdeccc81bf8",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -913,7 +913,7 @@ func TestConversationKey031(t *testing.T) {
|
||||
t,
|
||||
"63bffa986e382b0ac8ccc1aa93d18a7aa445116478be6f2453bad1f2d3af2344",
|
||||
"b895c70a83e782c1cf84af558d1038e6b211c6f84ede60408f519a293201031d",
|
||||
"3a3b8f00d4987fc6711d9be64d9c59cf9a709c6c6481c2cde404bcc7a28f174e",
|
||||
"3445872a13f45a46ecd362c0e347cd32b3532b1b4cd35ec567ad4d4afe7a1665",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -922,7 +922,7 @@ func TestConversationKey032(t *testing.T) {
|
||||
t,
|
||||
"e4a8bcacbf445fd3721792b939ff58e691cdcba6a8ba67ac3467b45567a03e5c",
|
||||
"b54053189e8c9252c6950059c783edb10675d06d20c7b342f73ec9fa6ed39c9d",
|
||||
"7b3933b4ef8189d347169c7955589fc1cfc01da5239591a08a183ff6694c44ad",
|
||||
"d9aee5a1c3491352e9cba0b8d3887c9aeb6f4a6caae19811d507bb3ef47210b2d",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -952,7 +952,7 @@ func TestConversationKey035(t *testing.T) {
|
||||
t,
|
||||
"0000000000000000000000000000000000000000000000000000000000000001",
|
||||
"79be667ef9dcbbac55a06295ce870b07029bfcdb2dce28d959f2815b16f81798",
|
||||
"3b4610cb7189beb9cc29eb3716ecc6102f1247e8f3101a03a1787d8908aeb54e",
|
||||
"7b88c5403f9b6598e1dcad39aa052aadfd50f357c7dc498b93d928e518685737",
|
||||
)
|
||||
}
|
||||
|
||||
@@ -1378,4 +1378,4 @@ func assertCryptPub(
|
||||
return
|
||||
}
|
||||
assert.Equal(t, decrypted, plaintextBytes, "wrong decryption")
|
||||
}
|
||||
}
|
||||
@@ -1,202 +0,0 @@
|
||||
|
||||
Apache License
|
||||
Version 2.0, January 2004
|
||||
http://www.apache.org/licenses/
|
||||
|
||||
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
||||
|
||||
1. Definitions.
|
||||
|
||||
"License" shall mean the terms and conditions for use, reproduction,
|
||||
and distribution as defined by Sections 1 through 9 of this document.
|
||||
|
||||
"Licensor" shall mean the copyright owner or entity authorized by
|
||||
the copyright owner that is granting the License.
|
||||
|
||||
"Legal Entity" shall mean the union of the acting entity and all
|
||||
other entities that control, are controlled by, or are under common
|
||||
control with that entity. For the purposes of this definition,
|
||||
"control" means (i) the power, direct or indirect, to cause the
|
||||
direction or management of such entity, whether by contract or
|
||||
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
||||
outstanding shares, or (iii) beneficial ownership of such entity.
|
||||
|
||||
"You" (or "Your") shall mean an individual or Legal Entity
|
||||
exercising permissions granted by this License.
|
||||
|
||||
"Source" form shall mean the preferred form for making modifications,
|
||||
including but not limited to software source code, documentation
|
||||
source, and configuration files.
|
||||
|
||||
"Object" form shall mean any form resulting from mechanical
|
||||
transformation or translation of a Source form, including but
|
||||
not limited to compiled object code, generated documentation,
|
||||
and conversions to other media types.
|
||||
|
||||
"Work" shall mean the work of authorship, whether in Source or
|
||||
Object form, made available under the License, as indicated by a
|
||||
copyright notice that is included in or attached to the work
|
||||
(an example is provided in the Appendix below).
|
||||
|
||||
"Derivative Works" shall mean any work, whether in Source or Object
|
||||
form, that is based on (or derived from) the Work and for which the
|
||||
editorial revisions, annotations, elaborations, or other modifications
|
||||
represent, as a whole, an original work of authorship. For the purposes
|
||||
of this License, Derivative Works shall not include works that remain
|
||||
separable from, or merely link (or bind by name) to the interfaces of,
|
||||
the Work and Derivative Works thereof.
|
||||
|
||||
"Contribution" shall mean any work of authorship, including
|
||||
the original version of the Work and any modifications or additions
|
||||
to that Work or Derivative Works thereof, that is intentionally
|
||||
submitted to Licensor for inclusion in the Work by the copyright owner
|
||||
or by an individual or Legal Entity authorized to submit on behalf of
|
||||
the copyright owner. For the purposes of this definition, "submitted"
|
||||
means any form of electronic, verbal, or written communication sent
|
||||
to the Licensor or its representatives, including but not limited to
|
||||
communication on electronic mailing lists, source code control systems,
|
||||
and issue tracking systems that are managed by, or on behalf of, the
|
||||
Licensor for the purpose of discussing and improving the Work, but
|
||||
excluding communication that is conspicuously marked or otherwise
|
||||
designated in writing by the copyright owner as "Not a Contribution."
|
||||
|
||||
"Contributor" shall mean Licensor and any individual or Legal Entity
|
||||
on behalf of whom a Contribution has been received by Licensor and
|
||||
subsequently incorporated within the Work.
|
||||
|
||||
2. Grant of Copyright License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
copyright license to reproduce, prepare Derivative Works of,
|
||||
publicly display, publicly perform, sublicense, and distribute the
|
||||
Work and such Derivative Works in Source or Object form.
|
||||
|
||||
3. Grant of Patent License. Subject to the terms and conditions of
|
||||
this License, each Contributor hereby grants to You a perpetual,
|
||||
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
||||
(except as stated in this section) patent license to make, have made,
|
||||
use, offer to sell, sell, import, and otherwise transfer the Work,
|
||||
where such license applies only to those patent claims licensable
|
||||
by such Contributor that are necessarily infringed by their
|
||||
Contribution(s) alone or by combination of their Contribution(s)
|
||||
with the Work to which such Contribution(s) was submitted. If You
|
||||
institute patent litigation against any entity (including a
|
||||
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
||||
or a Contribution incorporated within the Work constitutes direct
|
||||
or contributory patent infringement, then any patent licenses
|
||||
granted to You under this License for that Work shall terminate
|
||||
as of the date such litigation is filed.
|
||||
|
||||
4. Redistribution. You may reproduce and distribute copies of the
|
||||
Work or Derivative Works thereof in any medium, with or without
|
||||
modifications, and in Source or Object form, provided that You
|
||||
meet the following conditions:
|
||||
|
||||
(a) You must give any other recipients of the Work or
|
||||
Derivative Works a copy of this License; and
|
||||
|
||||
(b) You must cause any modified files to carry prominent notices
|
||||
stating that You changed the files; and
|
||||
|
||||
(c) You must retain, in the Source form of any Derivative Works
|
||||
that You distribute, all copyright, patent, trademark, and
|
||||
attribution notices from the Source form of the Work,
|
||||
excluding those notices that do not pertain to any part of
|
||||
the Derivative Works; and
|
||||
|
||||
(d) If the Work includes a "NOTICE" text file as part of its
|
||||
distribution, then any Derivative Works that You distribute must
|
||||
include a readable copy of the attribution notices contained
|
||||
within such NOTICE file, excluding those notices that do not
|
||||
pertain to any part of the Derivative Works, in at least one
|
||||
of the following places: within a NOTICE text file distributed
|
||||
as part of the Derivative Works; within the Source form or
|
||||
documentation, if provided along with the Derivative Works; or,
|
||||
within a display generated by the Derivative Works, if and
|
||||
wherever such third-party notices normally appear. The contents
|
||||
of the NOTICE file are for informational purposes only and
|
||||
do not modify the License. You may add Your own attribution
|
||||
notices within Derivative Works that You distribute, alongside
|
||||
or as an addendum to the NOTICE text from the Work, provided
|
||||
that such additional attribution notices cannot be construed
|
||||
as modifying the License.
|
||||
|
||||
You may add Your own copyright statement to Your modifications and
|
||||
may provide additional or different license terms and conditions
|
||||
for use, reproduction, or distribution of Your modifications, or
|
||||
for any such Derivative Works as a whole, provided Your use,
|
||||
reproduction, and distribution of the Work otherwise complies with
|
||||
the conditions stated in this License.
|
||||
|
||||
5. Submission of Contributions. Unless You explicitly state otherwise,
|
||||
any Contribution intentionally submitted for inclusion in the Work
|
||||
by You to the Licensor shall be under the terms and conditions of
|
||||
this License, without any additional terms or conditions.
|
||||
Notwithstanding the above, nothing herein shall supersede or modify
|
||||
the terms of any separate license agreement you may have executed
|
||||
with Licensor regarding such Contributions.
|
||||
|
||||
6. Trademarks. This License does not grant permission to use the trade
|
||||
names, trademarks, service marks, or product names of the Licensor,
|
||||
except as required for reasonable and customary use in describing the
|
||||
origin of the Work and reproducing the content of the NOTICE file.
|
||||
|
||||
7. Disclaimer of Warranty. Unless required by applicable law or
|
||||
agreed to in writing, Licensor provides the Work (and each
|
||||
Contributor provides its Contributions) on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
||||
implied, including, without limitation, any warranties or conditions
|
||||
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
||||
PARTICULAR PURPOSE. You are solely responsible for determining the
|
||||
appropriateness of using or redistributing the Work and assume any
|
||||
risks associated with Your exercise of permissions under this License.
|
||||
|
||||
8. Limitation of Liability. In no event and under no legal theory,
|
||||
whether in tort (including negligence), contract, or otherwise,
|
||||
unless required by applicable law (such as deliberate and grossly
|
||||
negligent acts) or agreed to in writing, shall any Contributor be
|
||||
liable to You for damages, including any direct, indirect, special,
|
||||
incidental, or consequential damages of any character arising as a
|
||||
result of this License or out of the use or inability to use the
|
||||
Work (including but not limited to damages for loss of goodwill,
|
||||
work stoppage, computer failure or malfunction, or any and all
|
||||
other commercial damages or losses), even if such Contributor
|
||||
has been advised of the possibility of such damages.
|
||||
|
||||
9. Accepting Warranty or Additional Liability. While redistributing
|
||||
the Work or Derivative Works thereof, You may choose to offer,
|
||||
and charge a fee for, acceptance of support, warranty, indemnity,
|
||||
or other liability obligations and/or rights consistent with this
|
||||
License. However, in accepting such obligations, You may act only
|
||||
on Your own behalf and on Your sole responsibility, not on behalf
|
||||
of any other Contributor, and only if You agree to indemnify,
|
||||
defend, and hold each Contributor harmless for any liability
|
||||
incurred by, or claims asserted against, such Contributor by reason
|
||||
of your accepting any such warranty or additional liability.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
APPENDIX: How to apply the Apache License to your work.
|
||||
|
||||
To apply the Apache License to your work, attach the following
|
||||
boilerplate notice, with the fields enclosed by brackets "[]"
|
||||
replaced with your own identifying information. (Don't include
|
||||
the brackets!) The text should be enclosed in the appropriate
|
||||
comment syntax for the file format. We also recommend that a
|
||||
file or class name and description of purpose be included on the
|
||||
same "printed page" as the copyright notice for easier
|
||||
identification within third-party archives.
|
||||
|
||||
Copyright [yyyy] [name of copyright owner]
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
@@ -1,197 +0,0 @@
|
||||
# sha256-simd
|
||||
|
||||
Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions for x86
|
||||
and ARM64 for ARM.
|
||||
On AVX512 it provides an up to 8x improvement (over 3 GB/s per core).
|
||||
SHA Extensions give a performance boost of close to 4x over native.
|
||||
|
||||
## Introduction
|
||||
|
||||
This package is designed as a replacement for `crypto/sha256`.
|
||||
For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2
|
||||
instructions resulting in a massive performance improvement.
|
||||
|
||||
This package uses Golang assembly.
|
||||
The AVX512 version is based on Intel's "multi-buffer crypto library for
|
||||
IPSec" whereas the other Intel implementations are described in "Fast SHA-256
|
||||
Implementations on Intel Architecture Processors" by J. Guilford et al.
|
||||
|
||||
## Support for Intel SHA Extensions
|
||||
|
||||
Support for the Intel SHA Extensions has been added by Kristofer Peterson (
|
||||
@svenski123), originally developed for
|
||||
spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that
|
||||
support it (known thus far: Intel Celeron J3455 and AMD Ryzen), it gives a
|
||||
significant boost in performance (with thanks to @AudriusButkevicius for
|
||||
reporting the results; full
|
||||
results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)).
|
||||
|
||||
```
|
||||
$ benchcmp avx2.txt sha-ext.txt
|
||||
benchmark AVX2 MB/s SHA Ext MB/s speedup
|
||||
BenchmarkHash5M 514.40 1975.17 3.84x
|
||||
```
|
||||
|
||||
Thanks to Kristofer Peterson, we also added further performance changes, such
|
||||
as optimized padding and
|
||||
endian conversions, which sped up all implementations: Intel SHA alone
|
||||
doubled performance for small sizes, while
|
||||
the other changes increased everything by roughly 50%.
|
||||
|
||||
## Support for AVX512
|
||||
|
||||
We have added support for AVX512 which results in an up to 8x performance
|
||||
improvement over AVX2 (3.0 GHz Xeon Platinum 8124M CPU):
|
||||
|
||||
```
|
||||
$ benchcmp avx2.txt avx512.txt
|
||||
benchmark AVX2 MB/s AVX512 MB/s speedup
|
||||
BenchmarkHash5M 448.62 3498.20 7.80x
|
||||
```
|
||||
|
||||
The original code was developed by Intel as part of
|
||||
the [multi-buffer crypto library](https://github.com/intel/intel-ipsec-mb) for
|
||||
IPSec or more specifically
|
||||
this [AVX512](https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm)
|
||||
implementation. The key idea behind it is to process a total of 16 checksums in
|
||||
parallel by “transposing” 16 (independent) messages of 64 bytes between a total
|
||||
of 16 ZMM registers (each 64 bytes wide).
|
||||
|
||||
Transposing the input messages means that in order to take full advantage of the
|
||||
speedup you need to have a (server) workload where multiple threads are doing
|
||||
SHA256 calculations in parallel. Unfortunately for this algorithm it is not
|
||||
possible for two message blocks processed in parallel to be dependent on one
|
||||
another — because then the (interim) result of the first part of the message has
|
||||
to be an input into the processing of the second part of the message.
|
||||
|
||||
Whereas the original Intel C implementation requires some sort of explicit
|
||||
scheduling of messages to be processed in parallel, for Golang it makes sense to
|
||||
take advantage of channels in order to group messages together and use channels
|
||||
as well for sending back the results (thereby effectively decoupling the
|
||||
calculations). We have implemented a fairly simple scheduling mechanism that
|
||||
seems to work well in practice.
|
||||
|
||||
Due to this different way of scheduling, we decided to use an explicit method to
|
||||
instantiate the AVX512 version. Essentially one or more AVX512 processing
|
||||
servers ([
|
||||
`Avx512Server`](https://github.com/minio/sha256-simd/blob/master/sha256blockAvx512_amd64.go#L294))
|
||||
have to be created whereby each server can hash over 3 GB/s on a single core. An
|
||||
`hash.Hash` object ([
|
||||
`Avx512Digest`](https://github.com/minio/sha256-simd/blob/master/sha256blockAvx512_amd64.go#L45))
|
||||
is then instantiated using one of these servers and used in the regular fashion:
|
||||
|
||||
```go
|
||||
import "mleku.dev/pkg/sha256"
|
||||
|
||||
func main() {
|
||||
server := sha256.NewAvx512Server()
|
||||
h512 := sha256.NewAvx512(server)
|
||||
h512.Write(fileBlock)
|
||||
digest := h512.Sum([]byte{})
|
||||
}
|
||||
```
|
||||
|
||||
Note that, because of the scheduling overhead, for small messages (< 1 MB) you
|
||||
will be better off using the regular SHA256 hashing (but those are typically not
|
||||
performance critical anyway). Some other tips to get the best performance:
|
||||
|
||||
- Have many go routines doing SHA256 calculations in parallel.
|
||||
- Try to Write() messages in multiples of 64 bytes.
|
||||
- Try to keep the overall length of messages to a roughly similar size, i.e. 5
|
||||
MB (this way all 16 ‘lanes’ in the AVX512 computations are contributing as
|
||||
much as possible).
|
||||
|
||||
More detailed information can be found in
|
||||
this [blog](https://blog.minio.io/accelerate-sha256-up-to-8x-over-3-gb-s-per-core-with-avx512-a0b1d64f78f)
|
||||
post including scaling across cores.
|
||||
|
||||
## Drop-In Replacement
|
||||
|
||||
The following code snippet shows how you can use `github.com/minio/sha256-simd`.
|
||||
This will automatically select the fastest method for the architecture on which
|
||||
it will be executed.
|
||||
|
||||
```go
|
||||
import "crypto.orly/sha256"
|
||||
|
||||
func main() {
|
||||
...
|
||||
shaWriter := sha256.New()
|
||||
io.Copy(shaWriter, file)
|
||||
...
|
||||
}
|
||||
```
|
||||
|
||||
## Performance
|
||||
|
||||
Below is the speed in MB/s for a single core (ranked fast to slow) for blocks
|
||||
larger than 1 MB.
|
||||
|
||||
| Processor | SIMD | Speed (MB/s) |
|
||||
| --------------------------------- | ------- | -----------: |
|
||||
| 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 |
|
||||
| 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 |
|
||||
| 1.2 GHz ARM Cortex-A53 | ARM64 | 638 |
|
||||
|
||||
## asm2plan9s
|
||||
|
||||
In order to be able to work more easily with AVX512/AVX2 instructions, a
|
||||
separate tool was developed to convert SIMD instructions into the corresponding
|
||||
BYTE sequence as accepted by Go assembly.
|
||||
See [asm2plan9s](https://github.com/minio/asm2plan9s) for more information.
|
||||
|
||||
## Why and benefits
|
||||
|
||||
One of the most performance sensitive parts of
|
||||
the [Minio](https://github.com/minio/minio) object storage server is related to
|
||||
SHA256 hash sum calculations. For instance, during multipart uploads each part
|
||||
that is uploaded needs to be verified for data integrity by the server.
|
||||
|
||||
Other applications that can benefit from enhanced SHA256 performance are
|
||||
deduplication in storage systems, intrusion detection, version control systems,
|
||||
integrity checking, etc.
|
||||
|
||||
## ARM SHA Extensions
|
||||
|
||||
The 64-bit ARMv8 core has introduced new instructions for SHA1 and SHA2
|
||||
acceleration as part of
|
||||
the [Cryptography Extensions](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0501f/CHDFJBCJ.html).
|
||||
Below you can see a small excerpt highlighting one of the rounds as is done for
|
||||
the SHA256 calculation process (for full code
|
||||
see [sha256block_arm64.s](https://github.com/minio/sha256-simd/blob/master/sha256block_arm64.s)).
|
||||
|
||||
```
|
||||
sha256h q2, q3, v9.4s
|
||||
sha256h2 q3, q4, v9.4s
|
||||
sha256su0 v5.4s, v6.4s
|
||||
rev32 v8.16b, v8.16b
|
||||
add v9.4s, v7.4s, v18.4s
|
||||
mov v4.16b, v2.16b
|
||||
sha256h q2, q3, v10.4s
|
||||
sha256h2 q3, q4, v10.4s
|
||||
sha256su0 v6.4s, v7.4s
|
||||
sha256su1 v5.4s, v7.4s, v8.4s
|
||||
```
|
||||
|
||||
### Detailed benchmarks
|
||||
|
||||
Benchmarks generated on a 1.2 GHz Quad-Core ARM Cortex A53
|
||||
equipped [Pine64](https://www.pine64.com/).
|
||||
|
||||
```
|
||||
minio@minio-arm:$ benchcmp golang.txt arm64.txt
|
||||
benchmark golang arm64 speedup
|
||||
BenchmarkHash8Bytes-4 0.68 MB/s 5.70 MB/s 8.38x
|
||||
BenchmarkHash1K-4 5.65 MB/s 326.30 MB/s 57.75x
|
||||
BenchmarkHash8K-4 6.00 MB/s 570.63 MB/s 95.11x
|
||||
BenchmarkHash1M-4 6.05 MB/s 638.23 MB/s 105.49x
|
||||
```
|
||||
|
||||
## License
|
||||
|
||||
Released under the Apache License v2.0. You can find the complete text in the
|
||||
file LICENSE.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions are welcome, please send PRs for any enhancements.
|
||||
@@ -1,55 +0,0 @@
|
||||
// Minio Cloud Storage, (C) 2021 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
package sha256
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"github.com/klauspost/cpuid/v2"
|
||||
"io/ioutil"
|
||||
"runtime"
|
||||
)
|
||||
|
||||
var (
|
||||
hasIntelSha = runtime.GOARCH == "amd64" && cpuid.CPU.Supports(
|
||||
cpuid.SHA, cpuid.SSSE3,
|
||||
cpuid.SSE4,
|
||||
)
|
||||
hasAvx512 = cpuid.CPU.Supports(
|
||||
cpuid.AVX512F, cpuid.AVX512DQ, cpuid.AVX512BW,
|
||||
cpuid.AVX512VL,
|
||||
)
|
||||
)
|
||||
|
||||
func hasArmSha2() bool {
|
||||
if cpuid.CPU.Has(cpuid.SHA2) {
|
||||
return true
|
||||
}
|
||||
if runtime.GOARCH != "arm64" || runtime.GOOS != "linux" {
|
||||
return false
|
||||
}
|
||||
|
||||
// Fall back to hacky cpuinfo parsing...
|
||||
const procCPUInfo = "/proc/cpuinfo"
|
||||
|
||||
// Feature to check for.
|
||||
const sha256Feature = "sha2"
|
||||
|
||||
cpuInfo, err := ioutil.ReadFile(procCPUInfo)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
return bytes.Contains(cpuInfo, []byte(sha256Feature))
|
||||
}
|
||||
@@ -1,6 +0,0 @@
|
||||
// Package sha256 is taken from github.com/minio/sha256-simd, implementing,
|
||||
// where available, an accelerated SIMD implementation of sha256.
|
||||
//
|
||||
// This package should be updated against the upstream version from time to
|
||||
// time.
|
||||
package sha256
|
||||
@@ -1,470 +0,0 @@
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"hash"
|
||||
)
|
||||
|
||||
const (
	// Size is the length of a SHA256 checksum in bytes.
	Size = 32

	// BlockSize is the block length of SHA256 in bytes.
	BlockSize = 64
)

// Internal parameters: the number of bytes processed per block and the eight
// words the hash state is reset to.
const (
	chunk = BlockSize

	init0 = 0x6A09E667
	init1 = 0xBB67AE85
	init2 = 0x3C6EF372
	init3 = 0xA54FF53A
	init4 = 0x510E527F
	init5 = 0x9B05688C
	init6 = 0x1F83D9AB
	init7 = 0x5BE0CD19
)
|
||||
|
||||
// digest represents the partial evaluation of a checksum. It holds the running
// hash words plus any input that has not yet filled a whole block.
|
||||
type digest struct {
|
||||
// h is the eight-word hash state; Reset loads it with init0..init7.
h [8]uint32
|
||||
// x buffers input bytes that do not yet make up a full chunk-sized block.
x [chunk]byte
|
||||
// nx is the number of bytes of x currently in use.
nx int
|
||||
// len is the total number of bytes passed to Write so far.
len uint64
|
||||
}
|
||||
|
||||
// Reset digest back to default
|
||||
func (d *digest) Reset() {
|
||||
d.h[0] = init0
|
||||
d.h[1] = init1
|
||||
d.h[2] = init2
|
||||
d.h[3] = init3
|
||||
d.h[4] = init4
|
||||
d.h[5] = init5
|
||||
d.h[6] = init6
|
||||
d.h[7] = init7
|
||||
d.nx = 0
|
||||
d.len = 0
|
||||
}
|
||||
|
||||
// blockfuncType identifies which block-processing implementation is in use.
type blockfuncType int
|
||||
|
||||
const (
|
||||
// blockfuncStdlib is the zero value; with it, New returns the crypto/sha256
// implementation instead of this package's digest.
blockfuncStdlib blockfuncType = iota
|
||||
// blockfuncIntelSha makes block dispatch to blockIntelShaGo.
blockfuncIntelSha
|
||||
// blockfuncArmSha2 makes block dispatch to blockArmSha2Go.
blockfuncArmSha2
|
||||
// blockfuncForceGeneric matches none of the values above, so block falls
// through to blockGeneric. Note that it is declared as an untyped -1.
// NOTE(review): presumably used by tests to force the pure Go path — confirm.
blockfuncForceGeneric = -1
|
||||
)
|
||||
|
||||
// blockfunc is the implementation chosen by init; it stays blockfuncStdlib when
// no accelerated implementation is detected.
var blockfunc blockfuncType
|
||||
|
||||
func init() {
|
||||
switch {
|
||||
case hasIntelSha:
|
||||
blockfunc = blockfuncIntelSha
|
||||
case hasArmSha2():
|
||||
blockfunc = blockfuncArmSha2
|
||||
}
|
||||
}
|
||||
|
||||
// New returns a new hash.Hash computing the SHA256 checksum.
|
||||
func New() hash.Hash {
|
||||
if blockfunc == blockfuncStdlib {
|
||||
// Fallback to the standard golang implementation
|
||||
// if no features were found.
|
||||
return sha256.New()
|
||||
}
|
||||
|
||||
d := new(digest)
|
||||
d.Reset()
|
||||
return d
|
||||
}
|
||||
|
||||
// Sum256 - single caller sha256 helper
|
||||
func Sum256(data []byte) (result [Size]byte) {
|
||||
var d digest
|
||||
d.Reset()
|
||||
d.Write(data)
|
||||
result = d.checkSum()
|
||||
return
|
||||
}
|
||||
|
||||
// Return size of checksum
|
||||
func (d *digest) Size() int { return Size }
|
||||
|
||||
// Return blocksize of checksum
|
||||
func (d *digest) BlockSize() int { return BlockSize }
|
||||
|
||||
// Write to digest
|
||||
func (d *digest) Write(p []byte) (nn int, err error) {
|
||||
nn = len(p)
|
||||
d.len += uint64(nn)
|
||||
if d.nx > 0 {
|
||||
n := copy(d.x[d.nx:], p)
|
||||
d.nx += n
|
||||
if d.nx == chunk {
|
||||
block(d, d.x[:])
|
||||
d.nx = 0
|
||||
}
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) >= chunk {
|
||||
n := len(p) &^ (chunk - 1)
|
||||
block(d, p[:n])
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) > 0 {
|
||||
d.nx = copy(d.x[:], p)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Return sha256 sum in bytes
|
||||
func (d *digest) Sum(in []byte) []byte {
|
||||
// Make a copy of d0 so that caller can keep writing and summing.
|
||||
d0 := *d
|
||||
hash := d0.checkSum()
|
||||
return append(in, hash[:]...)
|
||||
}
|
||||
|
||||
// Intermediate checksum function
|
||||
func (d *digest) checkSum() (digest [Size]byte) {
|
||||
n := d.nx
|
||||
|
||||
var k [64]byte
|
||||
copy(k[:], d.x[:n])
|
||||
|
||||
k[n] = 0x80
|
||||
|
||||
if n >= 56 {
|
||||
block(d, k[:])
|
||||
|
||||
// clear block buffer - go compiles this to optimal 1x xorps + 4x movups
|
||||
// unfortunately expressing this more succinctly results in much worse code
|
||||
k[0] = 0
|
||||
k[1] = 0
|
||||
k[2] = 0
|
||||
k[3] = 0
|
||||
k[4] = 0
|
||||
k[5] = 0
|
||||
k[6] = 0
|
||||
k[7] = 0
|
||||
k[8] = 0
|
||||
k[9] = 0
|
||||
k[10] = 0
|
||||
k[11] = 0
|
||||
k[12] = 0
|
||||
k[13] = 0
|
||||
k[14] = 0
|
||||
k[15] = 0
|
||||
k[16] = 0
|
||||
k[17] = 0
|
||||
k[18] = 0
|
||||
k[19] = 0
|
||||
k[20] = 0
|
||||
k[21] = 0
|
||||
k[22] = 0
|
||||
k[23] = 0
|
||||
k[24] = 0
|
||||
k[25] = 0
|
||||
k[26] = 0
|
||||
k[27] = 0
|
||||
k[28] = 0
|
||||
k[29] = 0
|
||||
k[30] = 0
|
||||
k[31] = 0
|
||||
k[32] = 0
|
||||
k[33] = 0
|
||||
k[34] = 0
|
||||
k[35] = 0
|
||||
k[36] = 0
|
||||
k[37] = 0
|
||||
k[38] = 0
|
||||
k[39] = 0
|
||||
k[40] = 0
|
||||
k[41] = 0
|
||||
k[42] = 0
|
||||
k[43] = 0
|
||||
k[44] = 0
|
||||
k[45] = 0
|
||||
k[46] = 0
|
||||
k[47] = 0
|
||||
k[48] = 0
|
||||
k[49] = 0
|
||||
k[50] = 0
|
||||
k[51] = 0
|
||||
k[52] = 0
|
||||
k[53] = 0
|
||||
k[54] = 0
|
||||
k[55] = 0
|
||||
k[56] = 0
|
||||
k[57] = 0
|
||||
k[58] = 0
|
||||
k[59] = 0
|
||||
k[60] = 0
|
||||
k[61] = 0
|
||||
k[62] = 0
|
||||
k[63] = 0
|
||||
}
|
||||
binary.BigEndian.PutUint64(k[56:64], uint64(d.len)<<3)
|
||||
block(d, k[:])
|
||||
|
||||
{
|
||||
const i = 0
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 1
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 2
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 3
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 4
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 5
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 6
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
{
|
||||
const i = 7
|
||||
binary.BigEndian.PutUint32(digest[i*4:i*4+4], d.h[i])
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
func block(dig *digest, p []byte) {
|
||||
if blockfunc == blockfuncIntelSha {
|
||||
blockIntelShaGo(dig, p)
|
||||
} else if blockfunc == blockfuncArmSha2 {
|
||||
blockArmSha2Go(dig, p)
|
||||
} else {
|
||||
blockGeneric(dig, p)
|
||||
}
|
||||
}
|
||||
|
||||
func blockGeneric(dig *digest, p []byte) {
|
||||
var w [64]uint32
|
||||
h0, h1, h2, h3, h4, h5, h6, h7 := dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]
|
||||
for len(p) >= chunk {
|
||||
// Can interlace the computation of w with the
|
||||
// rounds below if needed for speed.
|
||||
for i := 0; i < 16; i++ {
|
||||
j := i * 4
|
||||
w[i] = uint32(p[j])<<24 | uint32(p[j+1])<<16 | uint32(p[j+2])<<8 | uint32(p[j+3])
|
||||
}
|
||||
for i := 16; i < 64; i++ {
|
||||
v1 := w[i-2]
|
||||
t1 := (v1>>17 | v1<<(32-17)) ^ (v1>>19 | v1<<(32-19)) ^ (v1 >> 10)
|
||||
v2 := w[i-15]
|
||||
t2 := (v2>>7 | v2<<(32-7)) ^ (v2>>18 | v2<<(32-18)) ^ (v2 >> 3)
|
||||
w[i] = t1 + w[i-7] + t2 + w[i-16]
|
||||
}
|
||||
|
||||
a, b, c, d, e, f, g, h := h0, h1, h2, h3, h4, h5, h6, h7
|
||||
|
||||
for i := 0; i < 64; i++ {
|
||||
t1 := h + ((e>>6 | e<<(32-6)) ^ (e>>11 | e<<(32-11)) ^ (e>>25 | e<<(32-25))) + ((e & f) ^ (^e & g)) + _K[i] + w[i]
|
||||
|
||||
t2 := ((a>>2 | a<<(32-2)) ^ (a>>13 | a<<(32-13)) ^ (a>>22 | a<<(32-22))) + ((a & b) ^ (a & c) ^ (b & c))
|
||||
|
||||
h = g
|
||||
g = f
|
||||
f = e
|
||||
e = d + t1
|
||||
d = c
|
||||
c = b
|
||||
b = a
|
||||
a = t1 + t2
|
||||
}
|
||||
|
||||
h0 += a
|
||||
h1 += b
|
||||
h2 += c
|
||||
h3 += d
|
||||
h4 += e
|
||||
h5 += f
|
||||
h6 += g
|
||||
h7 += h
|
||||
|
||||
p = p[chunk:]
|
||||
}
|
||||
|
||||
dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h0, h1, h2, h3, h4, h5, h6, h7
|
||||
}
|
||||
|
||||
// _K holds the 64 round constants added in the rounds of blockGeneric, one per
// round, listed four to a row.
var _K = []uint32{
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, // 0-3
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, // 4-7
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, // 8-11
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, // 12-15
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, // 16-19
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, // 20-23
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, // 24-27
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, // 28-31
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, // 32-35
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, // 36-39
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, // 40-43
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, // 44-47
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, // 48-51
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, // 52-55
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, // 56-59
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, // 60-63
}
|
||||
|
||||
// Layout of the serialised digest state produced by MarshalBinary and checked
// by UnmarshalBinary.
const (
|
||||
// magic256 is the identifier that prefixes a serialised state.
magic256 = "sha\x03"
|
||||
// marshaledSize is the exact serialised length: the identifier, the eight
// 4-byte hash words, the chunk-sized buffer and the 8-byte total length.
marshaledSize = len(magic256) + 8*4 + chunk + 8
|
||||
)
|
||||
|
||||
func (d *digest) MarshalBinary() ([]byte, error) {
|
||||
b := make([]byte, 0, marshaledSize)
|
||||
b = append(b, magic256...)
|
||||
b = appendUint32(b, d.h[0])
|
||||
b = appendUint32(b, d.h[1])
|
||||
b = appendUint32(b, d.h[2])
|
||||
b = appendUint32(b, d.h[3])
|
||||
b = appendUint32(b, d.h[4])
|
||||
b = appendUint32(b, d.h[5])
|
||||
b = appendUint32(b, d.h[6])
|
||||
b = appendUint32(b, d.h[7])
|
||||
b = append(b, d.x[:d.nx]...)
|
||||
b = b[:len(b)+len(d.x)-d.nx] // already zero
|
||||
b = appendUint64(b, d.len)
|
||||
return b, nil
|
||||
}
|
||||
|
||||
func (d *digest) UnmarshalBinary(b []byte) error {
|
||||
if len(b) < len(magic256) || string(b[:len(magic256)]) != magic256 {
|
||||
return errors.New("next.orly.dev/pkg/crypto/sha256: invalid hash state identifier")
|
||||
}
|
||||
if len(b) != marshaledSize {
|
||||
return errors.New("next.orly.dev/pkg/crypto/sha256: invalid hash state size")
|
||||
}
|
||||
b = b[len(magic256):]
|
||||
b, d.h[0] = consumeUint32(b)
|
||||
b, d.h[1] = consumeUint32(b)
|
||||
b, d.h[2] = consumeUint32(b)
|
||||
b, d.h[3] = consumeUint32(b)
|
||||
b, d.h[4] = consumeUint32(b)
|
||||
b, d.h[5] = consumeUint32(b)
|
||||
b, d.h[6] = consumeUint32(b)
|
||||
b, d.h[7] = consumeUint32(b)
|
||||
b = b[copy(d.x[:], b):]
|
||||
b, d.len = consumeUint64(b)
|
||||
d.nx = int(d.len % chunk)
|
||||
return nil
|
||||
}
|
||||
|
||||
// appendUint32 appends v to b as four big-endian bytes and returns the
// extended slice. It uses encoding/binary, like the rest of this file, rather
// than hand-written shifts.
func appendUint32(b []byte, v uint32) []byte {
	return binary.BigEndian.AppendUint32(b, v)
}
|
||||
|
||||
// appendUint64 appends v to b as eight big-endian bytes and returns the
// extended slice. It uses encoding/binary, like the rest of this file, rather
// than hand-written shifts.
func appendUint64(b []byte, v uint64) []byte {
	return binary.BigEndian.AppendUint64(b, v)
}
|
||||
|
||||
// consumeUint64 reads a big-endian uint64 from the first eight bytes of b and
// returns the rest of b together with the value. It panics if b holds fewer
// than eight bytes.
func consumeUint64(b []byte) ([]byte, uint64) {
	v := binary.BigEndian.Uint64(b)
	return b[8:], v
}
|
||||
|
||||
// consumeUint32 reads a big-endian uint32 from the first four bytes of b and
// returns the rest of b together with the value. It panics if b holds fewer
// than four bytes.
func consumeUint32(b []byte) ([]byte, uint32) {
	v := binary.BigEndian.Uint32(b)
	return b[4:], v
}
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,686 +0,0 @@
|
||||
|
||||
// 16x Parallel implementation of SHA256 for AVX512
|
||||
|
||||
//
|
||||
// Minio Cloud Storage, (C) 2017 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
//
|
||||
// This code is based on the Intel Multi-Buffer Crypto for IPSec library
|
||||
// and more specifically the following implementation:
|
||||
// https://github.com/intel/intel-ipsec-mb/blob/master/avx512/sha256_x16_avx512.asm
|
||||
//
|
||||
// For Golang it has been converted into Plan 9 assembly with the help of
|
||||
// github.com/minio/asm2plan9s to assemble the AVX512 instructions
|
||||
//
|
||||
|
||||
// Copyright (c) 2017, Intel Corporation
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are met:
|
||||
//
|
||||
// * Redistributions of source code must retain the above copyright notice,
|
||||
// this list of conditions and the following disclaimer.
|
||||
// * Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
// * Neither the name of Intel Corporation nor the names of its contributors
|
||||
// may be used to endorse or promote products derived from this software
|
||||
// without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
|
||||
// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define SHA256_DIGEST_ROW_SIZE 64
|
||||
|
||||
// arg1
|
||||
#define STATE rdi
|
||||
#define STATE_P9 DI
|
||||
// arg2
|
||||
#define INP_SIZE rsi
|
||||
#define INP_SIZE_P9 SI
|
||||
|
||||
#define IDX rcx
|
||||
#define TBL rdx
|
||||
#define TBL_P9 DX
|
||||
|
||||
#define INPUT rax
|
||||
#define INPUT_P9 AX
|
||||
|
||||
#define inp0 r9
|
||||
#define SCRATCH_P9 R12
|
||||
#define SCRATCH r12
|
||||
#define maskp r13
|
||||
#define MASKP_P9 R13
|
||||
#define mask r14
|
||||
#define MASK_P9 R14
|
||||
|
||||
#define A zmm0
|
||||
#define B zmm1
|
||||
#define C zmm2
|
||||
#define D zmm3
|
||||
#define E zmm4
|
||||
#define F zmm5
|
||||
#define G zmm6
|
||||
#define H zmm7
|
||||
#define T1 zmm8
|
||||
#define TMP0 zmm9
|
||||
#define TMP1 zmm10
|
||||
#define TMP2 zmm11
|
||||
#define TMP3 zmm12
|
||||
#define TMP4 zmm13
|
||||
#define TMP5 zmm14
|
||||
#define TMP6 zmm15
|
||||
|
||||
#define W0 zmm16
|
||||
#define W1 zmm17
|
||||
#define W2 zmm18
|
||||
#define W3 zmm19
|
||||
#define W4 zmm20
|
||||
#define W5 zmm21
|
||||
#define W6 zmm22
|
||||
#define W7 zmm23
|
||||
#define W8 zmm24
|
||||
#define W9 zmm25
|
||||
#define W10 zmm26
|
||||
#define W11 zmm27
|
||||
#define W12 zmm28
|
||||
#define W13 zmm29
|
||||
#define W14 zmm30
|
||||
#define W15 zmm31
|
||||
|
||||
|
||||
#define TRANSPOSE16(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7, _r8, _r9, _r10, _r11, _r12, _r13, _r14, _r15, _t0, _t1) \
|
||||
\
|
||||
\ // input r0 = {a15 a14 a13 a12 a11 a10 a9 a8 a7 a6 a5 a4 a3 a2 a1 a0}
|
||||
\ // r1 = {b15 b14 b13 b12 b11 b10 b9 b8 b7 b6 b5 b4 b3 b2 b1 b0}
|
||||
\ // r2 = {c15 c14 c13 c12 c11 c10 c9 c8 c7 c6 c5 c4 c3 c2 c1 c0}
|
||||
\ // r3 = {d15 d14 d13 d12 d11 d10 d9 d8 d7 d6 d5 d4 d3 d2 d1 d0}
|
||||
\ // r4 = {e15 e14 e13 e12 e11 e10 e9 e8 e7 e6 e5 e4 e3 e2 e1 e0}
|
||||
\ // r5 = {f15 f14 f13 f12 f11 f10 f9 f8 f7 f6 f5 f4 f3 f2 f1 f0}
|
||||
\ // r6 = {g15 g14 g13 g12 g11 g10 g9 g8 g7 g6 g5 g4 g3 g2 g1 g0}
|
||||
\ // r7 = {h15 h14 h13 h12 h11 h10 h9 h8 h7 h6 h5 h4 h3 h2 h1 h0}
|
||||
\ // r8 = {i15 i14 i13 i12 i11 i10 i9 i8 i7 i6 i5 i4 i3 i2 i1 i0}
|
||||
\ // r9 = {j15 j14 j13 j12 j11 j10 j9 j8 j7 j6 j5 j4 j3 j2 j1 j0}
|
||||
\ // r10 = {k15 k14 k13 k12 k11 k10 k9 k8 k7 k6 k5 k4 k3 k2 k1 k0}
|
||||
\ // r11 = {l15 l14 l13 l12 l11 l10 l9 l8 l7 l6 l5 l4 l3 l2 l1 l0}
|
||||
\ // r12 = {m15 m14 m13 m12 m11 m10 m9 m8 m7 m6 m5 m4 m3 m2 m1 m0}
|
||||
\ // r13 = {n15 n14 n13 n12 n11 n10 n9 n8 n7 n6 n5 n4 n3 n2 n1 n0}
|
||||
\ // r14 = {o15 o14 o13 o12 o11 o10 o9 o8 o7 o6 o5 o4 o3 o2 o1 o0}
|
||||
\ // r15 = {p15 p14 p13 p12 p11 p10 p9 p8 p7 p6 p5 p4 p3 p2 p1 p0}
|
||||
\
|
||||
\ // output r0 = { p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
|
||||
\ // r1 = { p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
|
||||
\ // r2 = { p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
|
||||
\ // r3 = { p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
|
||||
\ // r4 = { p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
|
||||
\ // r5 = { p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
|
||||
\ // r6 = { p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
|
||||
\ // r7 = { p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
|
||||
\ // r8 = { p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
|
||||
\ // r9 = { p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
|
||||
\ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
|
||||
\ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
|
||||
\ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
|
||||
\ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
|
||||
\ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
|
||||
\ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
|
||||
\
|
||||
\ // process top half
|
||||
vshufps _t0, _r0, _r1, 0x44 \ // t0 = {b13 b12 a13 a12 b9 b8 a9 a8 b5 b4 a5 a4 b1 b0 a1 a0}
|
||||
vshufps _r0, _r0, _r1, 0xEE \ // r0 = {b15 b14 a15 a14 b11 b10 a11 a10 b7 b6 a7 a6 b3 b2 a3 a2}
|
||||
vshufps _t1, _r2, _r3, 0x44 \ // t1 = {d13 d12 c13 c12 d9 d8 c9 c8 d5 d4 c5 c4 d1 d0 c1 c0}
|
||||
vshufps _r2, _r2, _r3, 0xEE \ // r2 = {d15 d14 c15 c14 d11 d10 c11 c10 d7 d6 c7 c6 d3 d2 c3 c2}
|
||||
\
|
||||
vshufps _r3, _t0, _t1, 0xDD \ // r3 = {d13 c13 b13 a13 d9 c9 b9 a9 d5 c5 b5 a5 d1 c1 b1 a1}
|
||||
vshufps _r1, _r0, _r2, 0x88 \ // r1 = {d14 c14 b14 a14 d10 c10 b10 a10 d6 c6 b6 a6 d2 c2 b2 a2}
|
||||
vshufps _r0, _r0, _r2, 0xDD \ // r0 = {d15 c15 b15 a15 d11 c11 b11 a11 d7 c7 b7 a7 d3 c3 b3 a3}
|
||||
vshufps _t0, _t0, _t1, 0x88 \ // t0 = {d12 c12 b12 a12 d8 c8 b8 a8 d4 c4 b4 a4 d0 c0 b0 a0}
|
||||
\
|
||||
\ // use r2 in place of t0
|
||||
vshufps _r2, _r4, _r5, 0x44 \ // r2 = {f13 f12 e13 e12 f9 f8 e9 e8 f5 f4 e5 e4 f1 f0 e1 e0}
|
||||
vshufps _r4, _r4, _r5, 0xEE \ // r4 = {f15 f14 e15 e14 f11 f10 e11 e10 f7 f6 e7 e6 f3 f2 e3 e2}
|
||||
vshufps _t1, _r6, _r7, 0x44 \ // t1 = {h13 h12 g13 g12 h9 h8 g9 g8 h5 h4 g5 g4 h1 h0 g1 g0}
|
||||
vshufps _r6, _r6, _r7, 0xEE \ // r6 = {h15 h14 g15 g14 h11 h10 g11 g10 h7 h6 g7 g6 h3 h2 g3 g2}
|
||||
\
|
||||
vshufps _r7, _r2, _t1, 0xDD \ // r7 = {h13 g13 f13 e13 h9 g9 f9 e9 h5 g5 f5 e5 h1 g1 f1 e1}
|
||||
vshufps _r5, _r4, _r6, 0x88 \ // r5 = {h14 g14 f14 e14 h10 g10 f10 e10 h6 g6 f6 e6 h2 g2 f2 e2}
|
||||
vshufps _r4, _r4, _r6, 0xDD \ // r4 = {h15 g15 f15 e15 h11 g11 f11 e11 h7 g7 f7 e7 h3 g3 f3 e3}
|
||||
vshufps _r2, _r2, _t1, 0x88 \ // r2 = {h12 g12 f12 e12 h8 g8 f8 e8 h4 g4 f4 e4 h0 g0 f0 e0}
|
||||
\
|
||||
\ // use r6 in place of t0
|
||||
vshufps _r6, _r8, _r9, 0x44 \ // r6 = {j13 j12 i13 i12 j9 j8 i9 i8 j5 j4 i5 i4 j1 j0 i1 i0}
|
||||
vshufps _r8, _r8, _r9, 0xEE \ // r8 = {j15 j14 i15 i14 j11 j10 i11 i10 j7 j6 i7 i6 j3 j2 i3 i2}
|
||||
vshufps _t1, _r10, _r11, 0x44 \ // t1 = {l13 l12 k13 k12 l9 l8 k9 k8 l5 l4 k5 k4 l1 l0 k1 k0}
|
||||
vshufps _r10, _r10, _r11, 0xEE \ // r10 = {l15 l14 k15 k14 l11 l10 k11 k10 l7 l6 k7 k6 l3 l2 k3 k2}
|
||||
\
|
||||
vshufps _r11, _r6, _t1, 0xDD \ // r11 = {l13 k13 j13 i13 l9 k9 j9 i9 l5 k5 j5 i5 l1 k1 j1 i1}
|
||||
vshufps _r9, _r8, _r10, 0x88 \ // r9 = {l14 k14 j14 i14 l10 k10 j10 i10 l6 k6 j6 i6 l2 k2 j2 i2}
|
||||
vshufps _r8, _r8, _r10, 0xDD \ // r8 = {l15 k15 j15 i15 l11 k11 j11 i11 l7 k7 j7 i7 l3 k3 j3 i3}
|
||||
vshufps _r6, _r6, _t1, 0x88 \ // r6 = {l12 k12 j12 i12 l8 k8 j8 i8 l4 k4 j4 i4 l0 k0 j0 i0}
|
||||
\
|
||||
\ // use r10 in place of t0
|
||||
vshufps _r10, _r12, _r13, 0x44 \ // r10 = {n13 n12 m13 m12 n9 n8 m9 m8 n5 n4 m5 m4 n1 n0 m1 m0}
|
||||
vshufps _r12, _r12, _r13, 0xEE \ // r12 = {n15 n14 m15 m14 n11 n10 m11 m10 n7 n6 m7 m6 n3 n2 m3 m2}
|
||||
vshufps _t1, _r14, _r15, 0x44 \ // t1 = {p13 p12 o13 o12 p9 p8 o9 o8 p5 p4 o5 o4 p1 p0 o1 o0}
|
||||
vshufps _r14, _r14, _r15, 0xEE \ // r14 = {p15 p14 o15 o14 p11 p10 o11 o10 p7 p6 o7 o6 p3 p2 o3 o2}
|
||||
\
|
||||
vshufps _r15, _r10, _t1, 0xDD \ // r15 = {p13 o13 n13 m13 p9 o9 n9 m9 p5 o5 n5 m5 p1 o1 n1 m1}
|
||||
vshufps _r13, _r12, _r14, 0x88 \ // r13 = {p14 o14 n14 m14 p10 o10 n10 m10 p6 o6 n6 m6 p2 o2 n2 m2}
|
||||
vshufps _r12, _r12, _r14, 0xDD \ // r12 = {p15 o15 n15 m15 p11 o11 n11 m11 p7 o7 n7 m7 p3 o3 n3 m3}
|
||||
vshufps _r10, _r10, _t1, 0x88 \ // r10 = {p12 o12 n12 m12 p8 o8 n8 m8 p4 o4 n4 m4 p0 o0 n0 m0}
|
||||
\
|
||||
\ // At this point, the registers that contain interesting data are:
|
||||
\ // t0, r3, r1, r0, r2, r7, r5, r4, r6, r11, r9, r8, r10, r15, r13, r12
|
||||
\ // Can use t1 and r14 as scratch registers
|
||||
LEAQ PSHUFFLE_TRANSPOSE16_MASK1<>(SB), BX \
|
||||
LEAQ PSHUFFLE_TRANSPOSE16_MASK2<>(SB), R8 \
|
||||
\
|
||||
vmovdqu32 _r14, [rbx] \
|
||||
vpermi2q _r14, _t0, _r2 \ // r14 = {h8 g8 f8 e8 d8 c8 b8 a8 h0 g0 f0 e0 d0 c0 b0 a0}
|
||||
vmovdqu32 _t1, [r8] \
|
||||
vpermi2q _t1, _t0, _r2 \ // t1 = {h12 g12 f12 e12 d12 c12 b12 a12 h4 g4 f4 e4 d4 c4 b4 a4}
|
||||
\
|
||||
vmovdqu32 _r2, [rbx] \
|
||||
vpermi2q _r2, _r3, _r7 \ // r2 = {h9 g9 f9 e9 d9 c9 b9 a9 h1 g1 f1 e1 d1 c1 b1 a1}
|
||||
vmovdqu32 _t0, [r8] \
|
||||
vpermi2q _t0, _r3, _r7 \ // t0 = {h13 g13 f13 e13 d13 c13 b13 a13 h5 g5 f5 e5 d5 c5 b5 a5}
|
||||
\
|
||||
vmovdqu32 _r3, [rbx] \
|
||||
vpermi2q _r3, _r1, _r5 \ // r3 = {h10 g10 f10 e10 d10 c10 b10 a10 h2 g2 f2 e2 d2 c2 b2 a2}
|
||||
vmovdqu32 _r7, [r8] \
|
||||
vpermi2q _r7, _r1, _r5 \ // r7 = {h14 g14 f14 e14 d14 c14 b14 a14 h6 g6 f6 e6 d6 c6 b6 a6}
|
||||
\
|
||||
vmovdqu32 _r1, [rbx] \
|
||||
vpermi2q _r1, _r0, _r4 \ // r1 = {h11 g11 f11 e11 d11 c11 b11 a11 h3 g3 f3 e3 d3 c3 b3 a3}
|
||||
vmovdqu32 _r5, [r8] \
|
||||
vpermi2q _r5, _r0, _r4 \ // r5 = {h15 g15 f15 e15 d15 c15 b15 a15 h7 g7 f7 e7 d7 c7 b7 a7}
|
||||
\
|
||||
vmovdqu32 _r0, [rbx] \
|
||||
vpermi2q _r0, _r6, _r10 \ // r0 = {p8 o8 n8 m8 l8 k8 j8 i8 p0 o0 n0 m0 l0 k0 j0 i0}
|
||||
vmovdqu32 _r4, [r8] \
|
||||
vpermi2q _r4, _r6, _r10 \ // r4 = {p12 o12 n12 m12 l12 k12 j12 i12 p4 o4 n4 m4 l4 k4 j4 i4}
|
||||
\
|
||||
vmovdqu32 _r6, [rbx] \
|
||||
vpermi2q _r6, _r11, _r15 \ // r6 = {p9 o9 n9 m9 l9 k9 j9 i9 p1 o1 n1 m1 l1 k1 j1 i1}
|
||||
vmovdqu32 _r10, [r8] \
|
||||
vpermi2q _r10, _r11, _r15 \ // r10 = {p13 o13 n13 m13 l13 k13 j13 i13 p5 o5 n5 m5 l5 k5 j5 i5}
|
||||
\
|
||||
vmovdqu32 _r11, [rbx] \
|
||||
vpermi2q _r11, _r9, _r13 \ // r11 = {p10 o10 n10 m10 l10 k10 j10 i10 p2 o2 n2 m2 l2 k2 j2 i2}
|
||||
vmovdqu32 _r15, [r8] \
|
||||
vpermi2q _r15, _r9, _r13 \ // r15 = {p14 o14 n14 m14 l14 k14 j14 i14 p6 o6 n6 m6 l6 k6 j6 i6}
|
||||
\
|
||||
vmovdqu32 _r9, [rbx] \
|
||||
vpermi2q _r9, _r8, _r12 \ // r9 = {p11 o11 n11 m11 l11 k11 j11 i11 p3 o3 n3 m3 l3 k3 j3 i3}
|
||||
vmovdqu32 _r13, [r8] \
|
||||
vpermi2q _r13, _r8, _r12 \ // r13 = {p15 o15 n15 m15 l15 k15 j15 i15 p7 o7 n7 m7 l7 k7 j7 i7}
|
||||
\
|
||||
\ // At this point r8 and r12 can be used as scratch registers
|
||||
vshuff64x2 _r8, _r14, _r0, 0xEE \ // r8 = {p8 o8 n8 m8 l8 k8 j8 i8 h8 g8 f8 e8 d8 c8 b8 a8}
|
||||
vshuff64x2 _r0, _r14, _r0, 0x44 \ // r0 = {p0 o0 n0 m0 l0 k0 j0 i0 h0 g0 f0 e0 d0 c0 b0 a0}
|
||||
\
|
||||
vshuff64x2 _r12, _t1, _r4, 0xEE \ // r12 = {p12 o12 n12 m12 l12 k12 j12 i12 h12 g12 f12 e12 d12 c12 b12 a12}
|
||||
vshuff64x2 _r4, _t1, _r4, 0x44 \ // r4 = {p4 o4 n4 m4 l4 k4 j4 i4 h4 g4 f4 e4 d4 c4 b4 a4}
|
||||
\
|
||||
vshuff64x2 _r14, _r7, _r15, 0xEE \ // r14 = {p14 o14 n14 m14 l14 k14 j14 i14 h14 g14 f14 e14 d14 c14 b14 a14}
|
||||
vshuff64x2 _t1, _r7, _r15, 0x44 \ // t1 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
|
||||
\
|
||||
vshuff64x2 _r15, _r5, _r13, 0xEE \ // r15 = {p15 o15 n15 m15 l15 k15 j15 i15 h15 g15 f15 e15 d15 c15 b15 a15}
|
||||
vshuff64x2 _r7, _r5, _r13, 0x44 \ // r7 = {p7 o7 n7 m7 l7 k7 j7 i7 h7 g7 f7 e7 d7 c7 b7 a7}
|
||||
\
|
||||
vshuff64x2 _r13, _t0, _r10, 0xEE \ // r13 = {p13 o13 n13 m13 l13 k13 j13 i13 h13 g13 f13 e13 d13 c13 b13 a13}
|
||||
vshuff64x2 _r5, _t0, _r10, 0x44 \ // r5 = {p5 o5 n5 m5 l5 k5 j5 i5 h5 g5 f5 e5 d5 c5 b5 a5}
|
||||
\
|
||||
vshuff64x2 _r10, _r3, _r11, 0xEE \ // r10 = {p10 o10 n10 m10 l10 k10 j10 i10 h10 g10 f10 e10 d10 c10 b10 a10}
|
||||
vshuff64x2 _t0, _r3, _r11, 0x44 \ // t0 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
|
||||
\
|
||||
vshuff64x2 _r11, _r1, _r9, 0xEE \ // r11 = {p11 o11 n11 m11 l11 k11 j11 i11 h11 g11 f11 e11 d11 c11 b11 a11}
|
||||
vshuff64x2 _r3, _r1, _r9, 0x44 \ // r3 = {p3 o3 n3 m3 l3 k3 j3 i3 h3 g3 f3 e3 d3 c3 b3 a3}
|
||||
\
|
||||
vshuff64x2 _r9, _r2, _r6, 0xEE \ // r9 = {p9 o9 n9 m9 l9 k9 j9 i9 h9 g9 f9 e9 d9 c9 b9 a9}
|
||||
vshuff64x2 _r1, _r2, _r6, 0x44 \ // r1 = {p1 o1 n1 m1 l1 k1 j1 i1 h1 g1 f1 e1 d1 c1 b1 a1}
|
||||
\
|
||||
vmovdqu32 _r2, _t0 \ // r2 = {p2 o2 n2 m2 l2 k2 j2 i2 h2 g2 f2 e2 d2 c2 b2 a2}
|
||||
vmovdqu32 _r6, _t1 \ // r6 = {p6 o6 n6 m6 l6 k6 j6 i6 h6 g6 f6 e6 d6 c6 b6 a6}
|
||||
|
||||
|
||||
// CH(A, B, C) = (A&B) ^ (~A&C)
|
||||
// MAJ(E, F, G) = (E&F) ^ (E&G) ^ (F&G)
|
||||
// SIGMA0 = ROR_2 ^ ROR_13 ^ ROR_22
|
||||
// SIGMA1 = ROR_6 ^ ROR_11 ^ ROR_25
|
||||
// sigma0 = ROR_7 ^ ROR_18 ^ SHR_3
|
||||
// sigma1 = ROR_17 ^ ROR_19 ^ SHR_10
|
||||
|
||||
// Main processing loop per round
//
// PROCESS_LOOP performs one SHA-256 compression round on all lanes at once.
//   _WT      register holding the message-schedule word Wt for this round
//   _ROUND   round number; used only to fetch the NEXT round constant
//   _A.._H   registers holding the eight working variables for this round
//
// On entry TMP3 must already hold Kt for this round. On exit TMP3 holds the
// constant for round _ROUND+1, _D holds the new E and _H holds the new A, so
// callers rotate the register arguments by one position for the next round
// instead of moving data between registers.
//
// NOTE(review): for _ROUND = 63 the trailing load addresses row 64 of TBL,
// i.e. one 64-byte row past a 64-row table. The loaded value is not consumed
// by a following round, but confirm the read itself is safe for the table's
// backing memory.
#define PROCESS_LOOP(_WT, _ROUND, _A, _B, _C, _D, _E, _F, _G, _H) \
    \ // T1 = H + SIGMA1(E) + CH(E, F, G) + Kt + Wt
    \ // T2 = SIGMA0(A) + MAJ(A, B, C)
    \ // H=G, G=F, F=E, E=D+T1, D=C, C=B, B=A, A=T1+T2
    \
    \ // H becomes T2, then add T1 for A
    \ // D becomes D + T1 for E
    \
    vpaddd      T1, _H, TMP3           \ // T1 = H + Kt
    vmovdqu32   TMP0, _E               \
    vprord      TMP1, _E, 6            \ // ROR_6(E)
    vprord      TMP2, _E, 11           \ // ROR_11(E)
    vprord      TMP3, _E, 25           \ // ROR_25(E)
    vpternlogd  TMP0, _F, _G, 0xCA     \ // TMP0 = CH(E,F,G)
    vpaddd      T1, T1, _WT            \ // T1 = T1 + Wt
    vpternlogd  TMP1, TMP2, TMP3, 0x96 \ // TMP1 = SIGMA1(E)
    vpaddd      T1, T1, TMP0           \ // T1 = T1 + CH(E,F,G)
    vpaddd      T1, T1, TMP1           \ // T1 = T1 + SIGMA1(E)
    vpaddd      _D, _D, T1             \ // D = D + T1
    \
    vprord      _H, _A, 2              \ // ROR_2(A)
    vprord      TMP2, _A, 13           \ // ROR_13(A)
    vprord      TMP3, _A, 22           \ // ROR_22(A)
    vmovdqu32   TMP0, _A               \
    vpternlogd  TMP0, _B, _C, 0xE8     \ // TMP0 = MAJ(A,B,C)
    vpternlogd  _H, TMP2, TMP3, 0x96   \ // H(T2) = SIGMA0(A)
    vpaddd      _H, _H, TMP0           \ // H(T2) = SIGMA0(A) + MAJ(A,B,C)
    vpaddd      _H, _H, T1             \ // H(A) = H(T2) + T1
    \
    vmovdqu32   TMP3, [TBL + ((_ROUND+1)*64)] \ // Next Kt
|
||||
|
||||
|
||||
// MSG_SCHED_ROUND_16_63 extends the message schedule in place: the register
// holding Wt-16 is overwritten with
//     Wt = Wt-16 + sigma1(Wt-2) + Wt-7 + sigma0(Wt-15)
//   _WT     on entry Wt-16, on exit the new Wt
//   _WTp1   register holding Wt-15
//   _WTp9   register holding Wt-7
//   _WTp14  register holding Wt-2
// TMP4, TMP5 and TMP6 are used as scratch.
#define MSG_SCHED_ROUND_16_63(_WT, _WTp1, _WTp9, _WTp14) \
    vprord      TMP4, _WTp14, 17       \ // ROR_17(Wt-2)
    vprord      TMP5, _WTp14, 19       \ // ROR_19(Wt-2)
    vpsrld      TMP6, _WTp14, 10       \ // SHR_10(Wt-2)
    vpternlogd  TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma1(Wt-2)
    \
    vpaddd      _WT, _WT, TMP4         \ // Wt = Wt-16 + sigma1(Wt-2)
    vpaddd      _WT, _WT, _WTp9        \ // Wt = Wt-16 + sigma1(Wt-2) + Wt-7
    \
    vprord      TMP4, _WTp1, 7         \ // ROR_7(Wt-15)
    vprord      TMP5, _WTp1, 18        \ // ROR_18(Wt-15)
    vpsrld      TMP6, _WTp1, 3         \ // SHR_3(Wt-15)
    vpternlogd  TMP4, TMP5, TMP6, 0x96 \ // TMP4 = sigma0(Wt-15)
    \
    vpaddd      _WT, _WT, TMP4         \ // Wt = Wt-16 + sigma1(Wt-2) +
    \ //      Wt-7 + sigma0(Wt-15)
|
||||
|
||||
|
||||
// Note this is reading in a block of data for one lane
// When all 16 are read, the data must be transposed to build msg schedule
//
// MSG_SCHED_ROUND_00_15 conditionally loads the next 64-byte input block of
// lane OFFSET into _WT.
//   _WT     destination register
//   OFFSET  lane number; selects both the mask bit and the input slice
//   LABEL   unique local label used to skip the load
// If bit OFFSET of MASK_P9 is clear the load is skipped and _WT is left
// untouched. Otherwise the data pointer of slice number OFFSET (24-byte
// slice headers starting at INPUT_P9) is fetched into R9 and 64 bytes are
// read from [inp0+IDX].
// NOTE(review): this relies on inp0 being an alias of R9 — confirm against
// the register defines at the top of the file.
#define MSG_SCHED_ROUND_00_15(_WT, OFFSET, LABEL) \
    TESTQ $(1<<OFFSET), MASK_P9      \
    JE    LABEL                      \
    MOVQ  OFFSET*24(INPUT_P9), R9    \
    vmovups _WT, [inp0+IDX]          \
LABEL:                               \
|
||||
|
||||
// MASKED_LOAD conditionally loads a 64-byte input block of lane OFFSET into
// _WT; it is used for the first block, before the main loop starts.
//   _WT     destination register
//   OFFSET  lane number; selects both the mask bit and the input slice
//   LABEL   unique local label used to skip the load
// The load is skipped (and _WT left untouched) when bit OFFSET of MASK_P9 is
// clear. The instruction sequence is the same as MSG_SCHED_ROUND_00_15.
#define MASKED_LOAD(_WT, OFFSET, LABEL) \
    TESTQ $(1<<OFFSET), MASK_P9      \
    JE    LABEL                      \
    MOVQ  OFFSET*24(INPUT_P9), R9    \
    vmovups _WT,[inp0+IDX]           \
LABEL:                               \
|
||||
|
||||
// sha256_x16_avx512 runs the SHA-256 compression function over up to 16
// independent lanes in parallel.
//
// Arguments as read from the frame:
//   digests+0(FP)   pointer to the digest state, accessed as 8 rows (A..H)
//   scratch+8(FP)   pointer to a work area used to save A..H for each block
//   table+16(FP)    pointer to the round-constant table, 64 bytes per round
//   mask+24(FP)     pointer to one 64-bit lane mask per block
//   mask_len+32(FP) number of masks, i.e. number of blocks to process
//   inputs+48(FP)   16 slice headers (24 bytes each), one per lane
//
// For each block, lanes whose mask bit is clear are neither loaded nor have
// the block's result added into their digest.
//
// NOTE(review): the symbol name must match the Go declaration of the
// assembly stub — verify against the generated .s file.
TEXT ·sha256_x16_avx512(SB), 7, $0
    MOVQ  digests+0(FP), STATE_P9                //
    MOVQ  scratch+8(FP), SCRATCH_P9
    MOVQ  mask_len+32(FP), INP_SIZE_P9           // number of blocks to process
    MOVQ  mask+24(FP), MASKP_P9
    MOVQ (MASKP_P9), MASK_P9
    kmovq k1, mask
    LEAQ  inputs+48(FP), INPUT_P9

    // Initialize digests
    vmovdqu32 A, [STATE + 0*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 B, [STATE + 1*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 C, [STATE + 2*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 D, [STATE + 3*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 E, [STATE + 4*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 F, [STATE + 5*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 G, [STATE + 6*SHA256_DIGEST_ROW_SIZE]
    vmovdqu32 H, [STATE + 7*SHA256_DIGEST_ROW_SIZE]

    MOVQ  table+16(FP), TBL_P9

    xor IDX, IDX

    // Read in first block of input data
    MASKED_LOAD( W0,  0, skipInput0)
    MASKED_LOAD( W1,  1, skipInput1)
    MASKED_LOAD( W2,  2, skipInput2)
    MASKED_LOAD( W3,  3, skipInput3)
    MASKED_LOAD( W4,  4, skipInput4)
    MASKED_LOAD( W5,  5, skipInput5)
    MASKED_LOAD( W6,  6, skipInput6)
    MASKED_LOAD( W7,  7, skipInput7)
    MASKED_LOAD( W8,  8, skipInput8)
    MASKED_LOAD( W9,  9, skipInput9)
    MASKED_LOAD(W10, 10, skipInput10)
    MASKED_LOAD(W11, 11, skipInput11)
    MASKED_LOAD(W12, 12, skipInput12)
    MASKED_LOAD(W13, 13, skipInput13)
    MASKED_LOAD(W14, 14, skipInput14)
    MASKED_LOAD(W15, 15, skipInput15)

lloop:
    // TBL is borrowed to fetch the byte-flip mask, then restored below.
    LEAQ PSHUFFLE_BYTE_FLIP_MASK<>(SB), TBL_P9
    vmovdqu32 TMP2, [TBL]

    // Get first K from table
    MOVQ  table+16(FP), TBL_P9
    vmovdqu32 TMP3, [TBL]

    // Save digests for later addition
    vmovdqu32 [SCRATCH + 64*0], A
    vmovdqu32 [SCRATCH + 64*1], B
    vmovdqu32 [SCRATCH + 64*2], C
    vmovdqu32 [SCRATCH + 64*3], D
    vmovdqu32 [SCRATCH + 64*4], E
    vmovdqu32 [SCRATCH + 64*5], F
    vmovdqu32 [SCRATCH + 64*6], G
    vmovdqu32 [SCRATCH + 64*7], H

    // Advance the per-lane input offset to the next 64-byte block.
    add IDX, 64

    // Transpose input data
    TRANSPOSE16(W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11, W12, W13, W14, W15, TMP0, TMP1)

    // Byte-swap every 32-bit word of the message (TMP2 = byte-flip mask).
    vpshufb W0, W0, TMP2
    vpshufb W1, W1, TMP2
    vpshufb W2, W2, TMP2
    vpshufb W3, W3, TMP2
    vpshufb W4, W4, TMP2
    vpshufb W5, W5, TMP2
    vpshufb W6, W6, TMP2
    vpshufb W7, W7, TMP2
    vpshufb W8, W8, TMP2
    vpshufb W9, W9, TMP2
    vpshufb W10, W10, TMP2
    vpshufb W11, W11, TMP2
    vpshufb W12, W12, TMP2
    vpshufb W13, W13, TMP2
    vpshufb W14, W14, TMP2
    vpshufb W15, W15, TMP2

    // MSG Schedule for W0-W15 is now complete in registers
    // Process first 48 rounds
    // Calculate next Wt+16 after processing is complete and Wt is unneeded

    PROCESS_LOOP( W0,  0, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1,  1, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2,  2, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3,  3, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4,  4, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5,  5, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6,  6, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7,  7, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8,  8, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9,  9, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 10, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 11, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 12, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 13, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 14, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 15, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)
    PROCESS_LOOP( W0, 16, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1, 17, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2, 18, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3, 19, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4, 20, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5, 21, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6, 22, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7, 23, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8, 24, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9, 25, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 26, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 27, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 28, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 29, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 30, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 31, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)
    PROCESS_LOOP( W0, 32, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W0,  W1,  W9, W14)
    PROCESS_LOOP( W1, 33, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W1,  W2, W10, W15)
    PROCESS_LOOP( W2, 34, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63( W2,  W3, W11,  W0)
    PROCESS_LOOP( W3, 35, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63( W3,  W4, W12,  W1)
    PROCESS_LOOP( W4, 36, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63( W4,  W5, W13,  W2)
    PROCESS_LOOP( W5, 37, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63( W5,  W6, W14,  W3)
    PROCESS_LOOP( W6, 38, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63( W6,  W7, W15,  W4)
    PROCESS_LOOP( W7, 39, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63( W7,  W8,  W0,  W5)
    PROCESS_LOOP( W8, 40, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_16_63( W8,  W9,  W1,  W6)
    PROCESS_LOOP( W9, 41, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_16_63( W9, W10,  W2,  W7)
    PROCESS_LOOP(W10, 42, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_16_63(W10, W11,  W3,  W8)
    PROCESS_LOOP(W11, 43, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_16_63(W11, W12,  W4,  W9)
    PROCESS_LOOP(W12, 44, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_16_63(W12, W13,  W5, W10)
    PROCESS_LOOP(W13, 45, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_16_63(W13, W14,  W6, W11)
    PROCESS_LOOP(W14, 46, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_16_63(W14, W15,  W7, W12)
    PROCESS_LOOP(W15, 47, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_16_63(W15,  W0,  W8, W13)

    // Check if this is the last block
    sub INP_SIZE, 1
    JE  lastLoop

    // Load next mask for inputs
    ADDQ $8, MASKP_P9
    MOVQ (MASKP_P9), MASK_P9

    // Process last 16 rounds
    // Read in next block msg data for use in first 16 words of msg sched

    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W0,  0, skipNext0)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W1,  1, skipNext1)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15( W2,  2, skipNext2)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15( W3,  3, skipNext3)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15( W4,  4, skipNext4)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15( W5,  5, skipNext5)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15( W6,  6, skipNext6)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15( W7,  7, skipNext7)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    MSG_SCHED_ROUND_00_15( W8,  8, skipNext8)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    MSG_SCHED_ROUND_00_15( W9,  9, skipNext9)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    MSG_SCHED_ROUND_00_15(W10, 10, skipNext10)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    MSG_SCHED_ROUND_00_15(W11, 11, skipNext11)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    MSG_SCHED_ROUND_00_15(W12, 12, skipNext12)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    MSG_SCHED_ROUND_00_15(W13, 13, skipNext13)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    MSG_SCHED_ROUND_00_15(W14, 14, skipNext14)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)
    MSG_SCHED_ROUND_00_15(W15, 15, skipNext15)

    // Add old digest
    // k1 still holds the mask of the block just processed, so only lanes
    // that took part in that block receive the addition.
    vmovdqu32  TMP2, A
    vmovdqu32  A, [SCRATCH + 64*0]
    vpaddd     A{k1}, A, TMP2
    vmovdqu32  TMP2, B
    vmovdqu32  B, [SCRATCH + 64*1]
    vpaddd     B{k1}, B, TMP2
    vmovdqu32  TMP2, C
    vmovdqu32  C, [SCRATCH + 64*2]
    vpaddd     C{k1}, C, TMP2
    vmovdqu32  TMP2, D
    vmovdqu32  D, [SCRATCH + 64*3]
    vpaddd     D{k1}, D, TMP2
    vmovdqu32  TMP2, E
    vmovdqu32  E, [SCRATCH + 64*4]
    vpaddd     E{k1}, E, TMP2
    vmovdqu32  TMP2, F
    vmovdqu32  F, [SCRATCH + 64*5]
    vpaddd     F{k1}, F, TMP2
    vmovdqu32  TMP2, G
    vmovdqu32  G, [SCRATCH + 64*6]
    vpaddd     G{k1}, G, TMP2
    vmovdqu32  TMP2, H
    vmovdqu32  H, [SCRATCH + 64*7]
    vpaddd     H{k1}, H, TMP2

    // Switch k1 to the mask of the block that was just loaded.
    kmovq k1, mask
    JMP lloop

lastLoop:
    // Process last 16 rounds
    PROCESS_LOOP( W0, 48, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W1, 49, H, A, B, C, D, E, F, G)
    PROCESS_LOOP( W2, 50, G, H, A, B, C, D, E, F)
    PROCESS_LOOP( W3, 51, F, G, H, A, B, C, D, E)
    PROCESS_LOOP( W4, 52, E, F, G, H, A, B, C, D)
    PROCESS_LOOP( W5, 53, D, E, F, G, H, A, B, C)
    PROCESS_LOOP( W6, 54, C, D, E, F, G, H, A, B)
    PROCESS_LOOP( W7, 55, B, C, D, E, F, G, H, A)
    PROCESS_LOOP( W8, 56, A, B, C, D, E, F, G, H)
    PROCESS_LOOP( W9, 57, H, A, B, C, D, E, F, G)
    PROCESS_LOOP(W10, 58, G, H, A, B, C, D, E, F)
    PROCESS_LOOP(W11, 59, F, G, H, A, B, C, D, E)
    PROCESS_LOOP(W12, 60, E, F, G, H, A, B, C, D)
    PROCESS_LOOP(W13, 61, D, E, F, G, H, A, B, C)
    PROCESS_LOOP(W14, 62, C, D, E, F, G, H, A, B)
    PROCESS_LOOP(W15, 63, B, C, D, E, F, G, H, A)

    // Add old digest
    vmovdqu32  TMP2, A
    vmovdqu32  A, [SCRATCH + 64*0]
    vpaddd     A{k1}, A, TMP2
    vmovdqu32  TMP2, B
    vmovdqu32  B, [SCRATCH + 64*1]
    vpaddd     B{k1}, B, TMP2
    vmovdqu32  TMP2, C
    vmovdqu32  C, [SCRATCH + 64*2]
    vpaddd     C{k1}, C, TMP2
    vmovdqu32  TMP2, D
    vmovdqu32  D, [SCRATCH + 64*3]
    vpaddd     D{k1}, D, TMP2
    vmovdqu32  TMP2, E
    vmovdqu32  E, [SCRATCH + 64*4]
    vpaddd     E{k1}, E, TMP2
    vmovdqu32  TMP2, F
    vmovdqu32  F, [SCRATCH + 64*5]
    vpaddd     F{k1}, F, TMP2
    vmovdqu32  TMP2, G
    vmovdqu32  G, [SCRATCH + 64*6]
    vpaddd     G{k1}, G, TMP2
    vmovdqu32  TMP2, H
    vmovdqu32  H, [SCRATCH + 64*7]
    vpaddd     H{k1}, H, TMP2

    // Write out digest
    vmovdqu32 [STATE + 0*SHA256_DIGEST_ROW_SIZE], A
    vmovdqu32 [STATE + 1*SHA256_DIGEST_ROW_SIZE], B
    vmovdqu32 [STATE + 2*SHA256_DIGEST_ROW_SIZE], C
    vmovdqu32 [STATE + 3*SHA256_DIGEST_ROW_SIZE], D
    vmovdqu32 [STATE + 4*SHA256_DIGEST_ROW_SIZE], E
    vmovdqu32 [STATE + 5*SHA256_DIGEST_ROW_SIZE], F
    vmovdqu32 [STATE + 6*SHA256_DIGEST_ROW_SIZE], G
    vmovdqu32 [STATE + 7*SHA256_DIGEST_ROW_SIZE], H

    VZEROUPPER
    RET
|
||||
|
||||
//
|
||||
// Tables
|
||||
//
|
||||
|
||||
// PSHUFFLE_BYTE_FLIP_MASK is a 64-byte vpshufb control vector that reverses
// the byte order inside every 32-bit word. The same 16-byte pattern is
// repeated four times to fill the vector.
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x000(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x008(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x010(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x018(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x020(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x028(SB)/8, $0x0c0d0e0f08090a0b
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x030(SB)/8, $0x0405060700010203
DATA PSHUFFLE_BYTE_FLIP_MASK<>+0x038(SB)/8, $0x0c0d0e0f08090a0b
GLOBL PSHUFFLE_BYTE_FLIP_MASK<>(SB), 8, $64
|
||||
|
||||
// PSHUFFLE_TRANSPOSE16_MASK1 holds eight 64-bit indices
// {0, 1, 8, 9, 4, 5, 12, 13}.
// NOTE(review): presumably the index vector for a two-source quadword
// permute inside TRANSPOSE16 — confirm at the use site.
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x000(SB)/8, $0x0000000000000000
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x008(SB)/8, $0x0000000000000001
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x010(SB)/8, $0x0000000000000008
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x018(SB)/8, $0x0000000000000009
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x020(SB)/8, $0x0000000000000004
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x028(SB)/8, $0x0000000000000005
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x030(SB)/8, $0x000000000000000C
DATA PSHUFFLE_TRANSPOSE16_MASK1<>+0x038(SB)/8, $0x000000000000000D
GLOBL PSHUFFLE_TRANSPOSE16_MASK1<>(SB), 8, $64
|
||||
|
||||
// PSHUFFLE_TRANSPOSE16_MASK2 holds eight 64-bit indices
// {2, 3, 10, 11, 6, 7, 14, 15}.
// NOTE(review): presumably the index vector for a two-source quadword
// permute inside TRANSPOSE16 — confirm at the use site.
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x000(SB)/8, $0x0000000000000002
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x008(SB)/8, $0x0000000000000003
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x010(SB)/8, $0x000000000000000A
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x018(SB)/8, $0x000000000000000B
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x020(SB)/8, $0x0000000000000006
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x028(SB)/8, $0x0000000000000007
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x030(SB)/8, $0x000000000000000E
DATA PSHUFFLE_TRANSPOSE16_MASK2<>+0x038(SB)/8, $0x000000000000000F
GLOBL PSHUFFLE_TRANSPOSE16_MASK2<>(SB), 8, $64
|
||||
@@ -1,663 +0,0 @@
|
||||
//go:build !noasm && !appengine && gc
|
||||
// +build !noasm,!appengine,gc
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2017 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"hash"
|
||||
"sort"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
)
|
||||
|
||||
// sha256X16Avx512 is the declaration of the AVX512 assembly routine that
// hashes up to 16 lanes in parallel; it has no Go body.
//
//   - digests: interleaved digest state for all lanes, updated in place.
//   - scratch: work area the routine may overwrite.
//   - table:   round-constant table.
//   - mask:    lane masks consumed by the routine.
//   - inputs:  one input slice per lane.
//
// NOTE(review): presumably len(mask) is the number of 64-byte blocks and bit
// i of each entry enables lane i for that block — confirm against the
// assembly implementation.
//
//go:noescape
func sha256X16Avx512(
	digests *[512]byte, scratch *[512]byte, table *[512]uint64, mask []uint64,
	inputs [16][]byte,
)
|
||||
|
||||
// Avx512ServerUID - Do not start at 0 but next multiple of 16 so as to be able to
// differentiate with default initialization value of 0
const Avx512ServerUID = 16
|
||||
|
||||
// uidCounter is the source of digest ids; NewAvx512 increments it atomically
// and hands the new value to the digest it creates.
var uidCounter uint64
|
||||
|
||||
// NewAvx512 - initialize sha256 Avx512 implementation.
|
||||
func NewAvx512(a512srv *Avx512Server) hash.Hash {
|
||||
uid := atomic.AddUint64(&uidCounter, 1)
|
||||
return &Avx512Digest{uid: uid, a512srv: a512srv}
|
||||
}
|
||||
|
||||
// Avx512Digest - Type for computing SHA256 using Avx512
|
||||
type Avx512Digest struct {
|
||||
uid uint64
|
||||
a512srv *Avx512Server
|
||||
x [chunk]byte
|
||||
nx int
|
||||
len uint64
|
||||
final bool
|
||||
result [Size]byte
|
||||
}
|
||||
|
||||
// Size - Return size of checksum
|
||||
func (d *Avx512Digest) Size() int { return Size }
|
||||
|
||||
// BlockSize - Return blocksize of checksum
|
||||
func (d Avx512Digest) BlockSize() int { return BlockSize }
|
||||
|
||||
// Reset - reset sha digest to its initial values
|
||||
func (d *Avx512Digest) Reset() {
|
||||
d.a512srv.blocksCh <- blockInput{uid: d.uid, reset: true}
|
||||
d.nx = 0
|
||||
d.len = 0
|
||||
d.final = false
|
||||
}
|
||||
|
||||
// Write to digest
|
||||
func (d *Avx512Digest) Write(p []byte) (nn int, err error) {
|
||||
|
||||
if d.final {
|
||||
return 0, errors.New("Avx512Digest already finalized. Reset first before writing again")
|
||||
}
|
||||
|
||||
nn = len(p)
|
||||
d.len += uint64(nn)
|
||||
if d.nx > 0 {
|
||||
n := copy(d.x[d.nx:], p)
|
||||
d.nx += n
|
||||
if d.nx == chunk {
|
||||
d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: d.x[:]}
|
||||
d.nx = 0
|
||||
}
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) >= chunk {
|
||||
n := len(p) &^ (chunk - 1)
|
||||
d.a512srv.blocksCh <- blockInput{uid: d.uid, msg: p[:n]}
|
||||
p = p[n:]
|
||||
}
|
||||
if len(p) > 0 {
|
||||
d.nx = copy(d.x[:], p)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Sum - Return sha256 sum in bytes
|
||||
func (d *Avx512Digest) Sum(in []byte) (result []byte) {
|
||||
|
||||
if d.final {
|
||||
return append(in, d.result[:]...)
|
||||
}
|
||||
|
||||
trail := make([]byte, 0, 128)
|
||||
trail = append(trail, d.x[:d.nx]...)
|
||||
|
||||
len := d.len
|
||||
// Padding. Add a 1 bit and 0 bits until 56 bytes mod 64.
|
||||
var tmp [64]byte
|
||||
tmp[0] = 0x80
|
||||
if len%64 < 56 {
|
||||
trail = append(trail, tmp[0:56-len%64]...)
|
||||
} else {
|
||||
trail = append(trail, tmp[0:64+56-len%64]...)
|
||||
}
|
||||
d.nx = 0
|
||||
|
||||
// Length in bits.
|
||||
len <<= 3
|
||||
for i := uint(0); i < 8; i++ {
|
||||
tmp[i] = byte(len >> (56 - 8*i))
|
||||
}
|
||||
trail = append(trail, tmp[0:8]...)
|
||||
|
||||
sumCh := make(chan [Size]byte)
|
||||
d.a512srv.blocksCh <- blockInput{
|
||||
uid: d.uid, msg: trail, final: true, sumCh: sumCh,
|
||||
}
|
||||
d.result = <-sumCh
|
||||
d.final = true
|
||||
return append(in, d.result[:]...)
|
||||
}
|
||||
|
||||
// table holds the 64 SHA-256 round constants laid out for the 16-lane
// assembly routine: each 32-bit constant fills one 64-byte row, stored as
// 8 uint64 values that carry the constant in both their upper and lower half.
// The table is generated from the constant list so the values appear once.
var table = func() (t [512]uint64) {
	k := [64]uint32{
		0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
		0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
		0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
		0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
		0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
		0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
		0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
		0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
		0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
		0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
		0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
		0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
		0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
		0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
		0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
		0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
	}
	const perRow = 8 // uint64 values per 64-byte row
	for round, c := range k {
		pair := uint64(c)<<32 | uint64(c)
		for i := 0; i < perRow; i++ {
			t[round*perRow+i] = pair
		}
	}
	return t
}()
|
||||
|
||||
// Interface function to assembly ode
|
||||
func blockAvx512(
|
||||
digests *[512]byte, input [16][]byte, mask []uint64,
|
||||
) [16][Size]byte {
|
||||
|
||||
scratch := [512]byte{}
|
||||
sha256X16Avx512(digests, &scratch, &table, mask, input)
|
||||
|
||||
output := [16][Size]byte{}
|
||||
for i := 0; i < 16; i++ {
|
||||
output[i] = getDigest(i, digests[:])
|
||||
}
|
||||
|
||||
return output
|
||||
}
|
||||
|
||||
func getDigest(index int, state []byte) (sum [Size]byte) {
|
||||
for j := 0; j < 16; j += 2 {
|
||||
for i := index*4 + j*Size; i < index*4+(j+1)*Size; i += Size {
|
||||
binary.BigEndian.PutUint32(
|
||||
sum[j*2:], binary.LittleEndian.Uint32(state[i:i+4]),
|
||||
)
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Message to send across input channel
|
||||
type blockInput struct {
|
||||
uid uint64
|
||||
msg []byte
|
||||
reset bool
|
||||
final bool
|
||||
sumCh chan [Size]byte
|
||||
}
|
||||
|
||||
// Avx512Server - Type to implement 16x parallel handling of SHA256 invocations
|
||||
type Avx512Server struct {
|
||||
blocksCh chan blockInput // Input channel
|
||||
totalIn int // Total number of inputs waiting to be processed
|
||||
lanes [16]Avx512LaneInfo // Array with info per lane (out of 16)
|
||||
digests map[uint64][Size]byte // Map of uids to (interim) digest results
|
||||
}
|
||||
|
||||
// Avx512LaneInfo - Info for each lane
|
||||
type Avx512LaneInfo struct {
|
||||
uid uint64 // unique identification for this SHA processing
|
||||
block []byte // input block to be processed
|
||||
outputCh chan [Size]byte // channel for output result
|
||||
}
|
||||
|
||||
// NewAvx512Server - Create new object for parallel processing handling
|
||||
func NewAvx512Server() *Avx512Server {
|
||||
a512srv := &Avx512Server{}
|
||||
a512srv.digests = make(map[uint64][Size]byte)
|
||||
a512srv.blocksCh = make(chan blockInput)
|
||||
|
||||
// Start a single thread for reading from the input channel
|
||||
go a512srv.Process()
|
||||
return a512srv
|
||||
}
|
||||
|
||||
// Process - Sole handler for reading from the input channel
|
||||
func (a512srv *Avx512Server) Process() {
|
||||
for {
|
||||
select {
|
||||
case block := <-a512srv.blocksCh:
|
||||
if block.reset {
|
||||
a512srv.reset(block.uid)
|
||||
continue
|
||||
}
|
||||
index := block.uid & 0xf
|
||||
// fmt.Println("Adding message:", block.uid, index)
|
||||
|
||||
if a512srv.lanes[index].block != nil { // If slot is already filled, process all inputs
|
||||
// fmt.Println("Invoking Blocks()")
|
||||
a512srv.blocks()
|
||||
}
|
||||
a512srv.totalIn++
|
||||
a512srv.lanes[index] = Avx512LaneInfo{
|
||||
uid: block.uid, block: block.msg,
|
||||
}
|
||||
if block.final {
|
||||
a512srv.lanes[index].outputCh = block.sumCh
|
||||
}
|
||||
if a512srv.totalIn == len(a512srv.lanes) {
|
||||
// fmt.Println("Invoking Blocks() while FULL: ")
|
||||
a512srv.blocks()
|
||||
}
|
||||
|
||||
// TODO: test with larger timeout
|
||||
case <-time.After(1 * time.Microsecond):
|
||||
for _, lane := range a512srv.lanes {
|
||||
if lane.block != nil { // check if there is any input to process
|
||||
// fmt.Println("Invoking Blocks() on TIMEOUT: ")
|
||||
a512srv.blocks()
|
||||
break // we are done
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Do a reset for this calculation
|
||||
func (a512srv *Avx512Server) reset(uid uint64) {
|
||||
|
||||
// Check if there is a message still waiting to be processed (and remove if so)
|
||||
for i, lane := range a512srv.lanes {
|
||||
if lane.uid == uid {
|
||||
if lane.block != nil {
|
||||
a512srv.lanes[i] = Avx512LaneInfo{} // clear message
|
||||
a512srv.totalIn--
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Delete entry from hash map
|
||||
delete(a512srv.digests, uid)
|
||||
}
|
||||
|
||||
// Invoke assembly and send results back
|
||||
func (a512srv *Avx512Server) blocks() {
|
||||
|
||||
inputs := [16][]byte{}
|
||||
for i := range inputs {
|
||||
inputs[i] = a512srv.lanes[i].block
|
||||
}
|
||||
|
||||
mask := expandMask(genMask(inputs))
|
||||
outputs := blockAvx512(a512srv.getDigests(), inputs, mask)
|
||||
|
||||
a512srv.totalIn = 0
|
||||
for i := 0; i < len(outputs); i++ {
|
||||
uid, outputCh := a512srv.lanes[i].uid, a512srv.lanes[i].outputCh
|
||||
a512srv.digests[uid] = outputs[i]
|
||||
a512srv.lanes[i] = Avx512LaneInfo{}
|
||||
|
||||
if outputCh != nil {
|
||||
// Send back result
|
||||
outputCh <- outputs[i]
|
||||
delete(a512srv.digests, uid) // Delete entry from hashmap
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (a512srv *Avx512Server) Write(uid uint64, p []byte) (nn int, err error) {
|
||||
a512srv.blocksCh <- blockInput{uid: uid, msg: p}
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
// Sum - return sha256 sum in bytes for a given sum id.
|
||||
func (a512srv *Avx512Server) Sum(uid uint64, p []byte) [32]byte {
|
||||
sumCh := make(chan [32]byte)
|
||||
a512srv.blocksCh <- blockInput{uid: uid, msg: p, final: true, sumCh: sumCh}
|
||||
return <-sumCh
|
||||
}
|
||||
|
||||
func (a512srv *Avx512Server) getDigests() *[512]byte {
|
||||
digests := [512]byte{}
|
||||
for i, lane := range a512srv.lanes {
|
||||
a, ok := a512srv.digests[lane.uid]
|
||||
if ok {
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+0*16)*4:], binary.LittleEndian.Uint32(a[0:4]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+1*16)*4:], binary.LittleEndian.Uint32(a[4:8]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+2*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[8:12]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+3*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[12:16]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+4*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[16:20]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+5*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[20:24]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+6*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[24:28]),
|
||||
)
|
||||
binary.BigEndian.PutUint32(
|
||||
digests[(i+7*16)*4:],
|
||||
binary.LittleEndian.Uint32(a[28:32]),
|
||||
)
|
||||
} else {
|
||||
binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
|
||||
binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
|
||||
binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
|
||||
binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
|
||||
binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
|
||||
binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
|
||||
binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
|
||||
binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
|
||||
}
|
||||
}
|
||||
return &digests
|
||||
}
|
||||
|
||||
// lane is a helper for sorting blocks based on length: it pairs the length
// of an input with the position of the lane that holds it.
type lane struct {
	len, pos uint
}
|
||||
|
||||
type lanes []lane
|
||||
|
||||
func (lns lanes) Len() int { return len(lns) }
|
||||
func (lns lanes) Swap(i, j int) { lns[i], lns[j] = lns[j], lns[i] }
|
||||
func (lns lanes) Less(i, j int) bool { return lns[i].len < lns[j].len }
|
||||
|
||||
// maskRounds is a helper pairing a 16-bit lane mask with the number of
// consecutive rounds for which that mask applies.
type maskRounds struct {
	mask, rounds uint64
}
|
||||
|
||||
func genMask(input [16][]byte) [16]maskRounds {
|
||||
|
||||
// Sort on blocks length small to large
|
||||
var sorted [16]lane
|
||||
for c, inpt := range input {
|
||||
sorted[c] = lane{uint(len(inpt)), uint(c)}
|
||||
}
|
||||
sort.Sort(lanes(sorted[:]))
|
||||
|
||||
// Create mask array including 'rounds' between masks
|
||||
m, round, index := uint64(0xffff), uint64(0), 0
|
||||
var mr [16]maskRounds
|
||||
for _, s := range sorted {
|
||||
if s.len > 0 {
|
||||
if uint64(s.len)>>6 > round {
|
||||
mr[index] = maskRounds{m, (uint64(s.len) >> 6) - round}
|
||||
index++
|
||||
}
|
||||
round = uint64(s.len) >> 6
|
||||
}
|
||||
m = m & ^(1 << uint(s.pos))
|
||||
}
|
||||
|
||||
return mr
|
||||
}
|
||||
|
||||
// TODO: remove function
|
||||
func expandMask(mr [16]maskRounds) []uint64 {
|
||||
size := uint64(0)
|
||||
for _, r := range mr {
|
||||
size += r.rounds
|
||||
}
|
||||
result, index := make([]uint64, size), 0
|
||||
for _, r := range mr {
|
||||
for j := uint64(0); j < r.rounds; j++ {
|
||||
result[index] = r.mask
|
||||
index++
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@@ -1,545 +0,0 @@
|
||||
//go:build !noasm && !appengine && gc
|
||||
// +build !noasm,!appengine,gc
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2017 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/binary"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"hash"
|
||||
"reflect"
|
||||
"sync"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestGoldenAVX512(t *testing.T) {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
server := NewAvx512Server()
|
||||
h512 := NewAvx512(server)
|
||||
|
||||
for _, g := range golden {
|
||||
h512.Reset()
|
||||
h512.Write([]byte(g.in))
|
||||
digest := h512.Sum([]byte{})
|
||||
s := fmt.Sprintf("%x", digest)
|
||||
if !reflect.DeepEqual(digest, g.out[:]) {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s", g.in, s,
|
||||
hex.EncodeToString(g.out[:]),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// createInputs returns 16 independently allocated, zero-filled buffers of
// size bytes each, one per lane.
func createInputs(size int) [16][]byte {
	var input [16][]byte
	for i := range input {
		input[i] = make([]byte, size)
	}
	return input
}
|
||||
|
||||
func initDigests() *[512]byte {
|
||||
digests := [512]byte{}
|
||||
for i := 0; i < 16; i++ {
|
||||
binary.LittleEndian.PutUint32(digests[(i+0*16)*4:], init0)
|
||||
binary.LittleEndian.PutUint32(digests[(i+1*16)*4:], init1)
|
||||
binary.LittleEndian.PutUint32(digests[(i+2*16)*4:], init2)
|
||||
binary.LittleEndian.PutUint32(digests[(i+3*16)*4:], init3)
|
||||
binary.LittleEndian.PutUint32(digests[(i+4*16)*4:], init4)
|
||||
binary.LittleEndian.PutUint32(digests[(i+5*16)*4:], init5)
|
||||
binary.LittleEndian.PutUint32(digests[(i+6*16)*4:], init6)
|
||||
binary.LittleEndian.PutUint32(digests[(i+7*16)*4:], init7)
|
||||
}
|
||||
return &digests
|
||||
}
|
||||
|
||||
func testSha256Avx512(t *testing.T, offset, padding int) [16][]byte {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return [16][]byte{}
|
||||
}
|
||||
|
||||
l := uint(len(golden[offset].in))
|
||||
extraBlock := uint(0)
|
||||
if padding == 0 {
|
||||
extraBlock += 9
|
||||
} else {
|
||||
extraBlock += 64
|
||||
}
|
||||
input := createInputs(int(l + extraBlock))
|
||||
for i := 0; i < 16; i++ {
|
||||
copy(input[i], golden[offset+i].in)
|
||||
input[i][l] = 0x80
|
||||
copy(input[i][l+1:], bytes.Repeat([]byte{0}, padding))
|
||||
|
||||
// Length in bits.
|
||||
len := uint64(l)
|
||||
len <<= 3
|
||||
for ii := uint(0); ii < 8; ii++ {
|
||||
input[i][l+1+uint(padding)+ii] = byte(len >> (56 - 8*ii))
|
||||
}
|
||||
}
|
||||
mask := make([]uint64, len(input[0])>>6)
|
||||
for m := range mask {
|
||||
mask[m] = 0xffff
|
||||
}
|
||||
output := blockAvx512(initDigests(), input, mask)
|
||||
for i := 0; i < 16; i++ {
|
||||
if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in,
|
||||
hex.EncodeToString(output[i][:]),
|
||||
hex.EncodeToString(golden[offset+i].out[:]),
|
||||
)
|
||||
}
|
||||
}
|
||||
return input
|
||||
}
|
||||
|
||||
func TestAvx512_1Block(t *testing.T) { testSha256Avx512(t, 31, 0) }
|
||||
func TestAvx512_3Blocks(t *testing.T) { testSha256Avx512(t, 47, 55) }
|
||||
|
||||
func TestAvx512_MixedBlocks(t *testing.T) {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
inputSingleBlock := testSha256Avx512(t, 31, 0)
|
||||
inputMultiBlock := testSha256Avx512(t, 47, 55)
|
||||
|
||||
input := [16][]byte{}
|
||||
|
||||
for i := range input {
|
||||
if i%2 == 0 {
|
||||
input[i] = inputMultiBlock[i]
|
||||
} else {
|
||||
input[i] = inputSingleBlock[i]
|
||||
}
|
||||
}
|
||||
|
||||
mask := [3]uint64{0xffff, 0x5555, 0x5555}
|
||||
output := blockAvx512(initDigests(), input, mask[:])
|
||||
var offset int
|
||||
for i := 0; i < len(output); i++ {
|
||||
if i%2 == 0 {
|
||||
offset = 47
|
||||
} else {
|
||||
offset = 31
|
||||
}
|
||||
if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in,
|
||||
hex.EncodeToString(output[i][:]),
|
||||
hex.EncodeToString(golden[offset+i].out[:]),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAvx512_MixedWithNilBlocks(t *testing.T) {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
inputSingleBlock := testSha256Avx512(t, 31, 0)
|
||||
inputMultiBlock := testSha256Avx512(t, 47, 55)
|
||||
|
||||
input := [16][]byte{}
|
||||
|
||||
for i := range input {
|
||||
if i%3 == 0 {
|
||||
input[i] = inputMultiBlock[i]
|
||||
} else if i%3 == 1 {
|
||||
input[i] = inputSingleBlock[i]
|
||||
} else {
|
||||
input[i] = nil
|
||||
}
|
||||
}
|
||||
|
||||
mask := [3]uint64{0xb6db, 0x9249, 0x9249}
|
||||
output := blockAvx512(initDigests(), input, mask[:])
|
||||
var offset int
|
||||
for i := 0; i < len(output); i++ {
|
||||
if i%3 == 2 { // for nil inputs
|
||||
initvec := [32]byte{
|
||||
0x6a, 0x09, 0xe6, 0x67, 0xbb, 0x67, 0xae, 0x85,
|
||||
0x3c, 0x6e, 0xf3, 0x72, 0xa5, 0x4f, 0xf5, 0x3a,
|
||||
0x51, 0x0e, 0x52, 0x7f, 0x9b, 0x05, 0x68, 0x8c,
|
||||
0x1f, 0x83, 0xd9, 0xab, 0x5b, 0xe0, 0xcd, 0x19,
|
||||
}
|
||||
if bytes.Compare(output[i][:], initvec[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256 for nil vector = %s want %s",
|
||||
hex.EncodeToString(output[i][:]),
|
||||
hex.EncodeToString(initvec[:]),
|
||||
)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if i%3 == 0 {
|
||||
offset = 47
|
||||
} else {
|
||||
offset = 31
|
||||
}
|
||||
if bytes.Compare(output[i][:], golden[offset+i].out[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in,
|
||||
hex.EncodeToString(output[i][:]),
|
||||
hex.EncodeToString(golden[offset+i].out[:]),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestAvx512Server(t *testing.T) {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
const offset = 31 + 16
|
||||
server := NewAvx512Server()
|
||||
|
||||
// First block of 64 bytes
|
||||
for i := 0; i < 16; i++ {
|
||||
input := make([]byte, 64)
|
||||
copy(input, golden[offset+i].in)
|
||||
server.Write(uint64(Avx512ServerUID+i), input)
|
||||
}
|
||||
|
||||
// Second block of 64 bytes
|
||||
for i := 0; i < 16; i++ {
|
||||
input := make([]byte, 64)
|
||||
copy(input, golden[offset+i].in[64:])
|
||||
server.Write(uint64(Avx512ServerUID+i), input)
|
||||
}
|
||||
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(16)
|
||||
|
||||
// Third and final block
|
||||
for i := 0; i < 16; i++ {
|
||||
input := make([]byte, 64)
|
||||
input[0] = 0x80
|
||||
copy(input[1:], bytes.Repeat([]byte{0}, 63-8))
|
||||
|
||||
// Length in bits.
|
||||
len := uint64(128)
|
||||
len <<= 3
|
||||
for ii := uint(0); ii < 8; ii++ {
|
||||
input[63-8+1+ii] = byte(len >> (56 - 8*ii))
|
||||
}
|
||||
go func(i int, uid uint64, input []byte) {
|
||||
output := server.Sum(uid, input)
|
||||
if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s",
|
||||
golden[offset+i].in,
|
||||
hex.EncodeToString(output[:]),
|
||||
hex.EncodeToString(golden[offset+i].out[:]),
|
||||
)
|
||||
}
|
||||
wg.Done()
|
||||
}(i, uint64(Avx512ServerUID+i), input)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func TestAvx512Digest(t *testing.T) {
|
||||
|
||||
if !hasAvx512 {
|
||||
// t.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
server := NewAvx512Server()
|
||||
|
||||
const tests = 16
|
||||
h512 := [16]hash.Hash{}
|
||||
for i := 0; i < tests; i++ {
|
||||
h512[i] = NewAvx512(server)
|
||||
}
|
||||
|
||||
const offset = 31 + 16
|
||||
for i := 0; i < tests; i++ {
|
||||
input := make([]byte, 64)
|
||||
copy(input, golden[offset+i].in)
|
||||
h512[i].Write(input)
|
||||
}
|
||||
for i := 0; i < tests; i++ {
|
||||
input := make([]byte, 64)
|
||||
copy(input, golden[offset+i].in[64:])
|
||||
h512[i].Write(input)
|
||||
}
|
||||
for i := 0; i < tests; i++ {
|
||||
output := h512[i].Sum([]byte{})
|
||||
if bytes.Compare(output[:], golden[offset+i].out[:]) != 0 {
|
||||
t.Fatalf(
|
||||
"Sum256 function: sha256(%s) = %s want %s", golden[offset+i].in,
|
||||
hex.EncodeToString(output[:]),
|
||||
hex.EncodeToString(golden[offset+i].out[:]),
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkAvx512SingleCore(h512 []hash.Hash, body []byte) {
|
||||
|
||||
for i := 0; i < len(h512); i++ {
|
||||
h512[i].Write(body)
|
||||
}
|
||||
for i := 0; i < len(h512); i++ {
|
||||
_ = h512[i].Sum([]byte{})
|
||||
}
|
||||
}
|
||||
|
||||
func benchmarkAvx512(b *testing.B, size int) {
|
||||
|
||||
if !hasAvx512 {
|
||||
b.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
server := NewAvx512Server()
|
||||
|
||||
const tests = 16
|
||||
body := make([]byte, size)
|
||||
|
||||
b.SetBytes(int64(len(body) * tests))
|
||||
b.ResetTimer()
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
h512 := make([]hash.Hash, tests)
|
||||
for i := 0; i < tests; i++ {
|
||||
h512[i] = NewAvx512(server)
|
||||
}
|
||||
|
||||
benchmarkAvx512SingleCore(h512, body)
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAvx512_05M(b *testing.B) { benchmarkAvx512(b, 512*1024) }
|
||||
func BenchmarkAvx512_1M(b *testing.B) { benchmarkAvx512(b, 1*1024*1024) }
|
||||
func BenchmarkAvx512_5M(b *testing.B) { benchmarkAvx512(b, 5*1024*1024) }
|
||||
func BenchmarkAvx512_10M(b *testing.B) { benchmarkAvx512(b, 10*1024*1024) }
|
||||
|
||||
func benchmarkAvx512MultiCore(b *testing.B, size, cores int) {
|
||||
|
||||
if !hasAvx512 {
|
||||
b.SkipNow()
|
||||
return
|
||||
}
|
||||
|
||||
servers := make([]*Avx512Server, cores)
|
||||
for c := 0; c < cores; c++ {
|
||||
servers[c] = NewAvx512Server()
|
||||
}
|
||||
|
||||
const tests = 16
|
||||
|
||||
body := make([]byte, size)
|
||||
|
||||
h512 := make([]hash.Hash, tests*cores)
|
||||
for i := 0; i < tests*cores; i++ {
|
||||
h512[i] = NewAvx512(servers[i>>4])
|
||||
}
|
||||
|
||||
b.SetBytes(int64(size * 16 * cores))
|
||||
b.ResetTimer()
|
||||
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for i := 0; i < b.N; i++ {
|
||||
wg.Add(cores)
|
||||
for c := 0; c < cores; c++ {
|
||||
go func(c int) {
|
||||
benchmarkAvx512SingleCore(
|
||||
h512[c*tests:(c+1)*tests],
|
||||
body,
|
||||
)
|
||||
wg.Done()
|
||||
}(c)
|
||||
}
|
||||
wg.Wait()
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkAvx512_5M_2Cores(b *testing.B) {
|
||||
benchmarkAvx512MultiCore(
|
||||
b, 5*1024*1024, 2,
|
||||
)
|
||||
}
|
||||
func BenchmarkAvx512_5M_4Cores(b *testing.B) {
|
||||
benchmarkAvx512MultiCore(
|
||||
b, 5*1024*1024, 4,
|
||||
)
|
||||
}
|
||||
func BenchmarkAvx512_5M_6Cores(b *testing.B) {
|
||||
benchmarkAvx512MultiCore(
|
||||
b, 5*1024*1024, 6,
|
||||
)
|
||||
}
|
||||
|
||||
type maskTest struct {
|
||||
in [16]int
|
||||
out [16]maskRounds
|
||||
}
|
||||
|
||||
var goldenMask = []maskTest{
|
||||
{[16]int{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, [16]maskRounds{}},
|
||||
{
|
||||
[16]int{64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0},
|
||||
[16]maskRounds{{0x5555, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64, 0, 64},
|
||||
[16]maskRounds{{0xaaaa, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64},
|
||||
[16]maskRounds{{0xffff, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128,
|
||||
128, 128, 128,
|
||||
},
|
||||
[16]maskRounds{{0xffff, 2}},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64,
|
||||
128,
|
||||
},
|
||||
[16]maskRounds{{0xffff, 1}, {0xaaaa, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128, 64, 128,
|
||||
64,
|
||||
},
|
||||
[16]maskRounds{{0xffff, 1}, {0x5555, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64, 192, 64,
|
||||
192,
|
||||
},
|
||||
[16]maskRounds{{0xffff, 1}, {0xaaaa, 2}},
|
||||
},
|
||||
//
|
||||
// >= 64 0110=6 1011=b 1101=d 0110=6
|
||||
// >=128 0100=4 0010=2 1001=9 0100=4
|
||||
{
|
||||
[16]int{0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0, 64, 128, 0},
|
||||
[16]maskRounds{{0x6db6, 1}, {0x4924, 1}},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
1 * 64, 2 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64,
|
||||
9 * 64, 10 * 64,
|
||||
11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64,
|
||||
},
|
||||
[16]maskRounds{
|
||||
{0xffff, 1}, {0xfffe, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1},
|
||||
{0xffe0, 1}, {0xffc0, 1}, {0xff80, 1},
|
||||
{0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1},
|
||||
{0xe000, 1},
|
||||
{0xc000, 1}, {0x8000, 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
2 * 64, 1 * 64, 3 * 64, 4 * 64, 5 * 64, 6 * 64, 7 * 64, 8 * 64,
|
||||
9 * 64, 10 * 64,
|
||||
11 * 64, 12 * 64, 13 * 64, 14 * 64, 15 * 64, 16 * 64,
|
||||
},
|
||||
[16]maskRounds{
|
||||
{0xffff, 1}, {0xfffd, 1}, {0xfffc, 1}, {0xfff8, 1}, {0xfff0, 1},
|
||||
{0xffe0, 1}, {0xffc0, 1}, {0xff80, 1},
|
||||
{0xff00, 1}, {0xfe00, 1}, {0xfc00, 1}, {0xf800, 1}, {0xf000, 1},
|
||||
{0xe000, 1},
|
||||
{0xc000, 1}, {0x8000, 1},
|
||||
},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
10 * 64, 20 * 64, 30 * 64, 40 * 64, 50 * 64, 60 * 64, 70 * 64,
|
||||
80 * 64, 90 * 64,
|
||||
100 * 64, 110 * 64, 120 * 64, 130 * 64, 140 * 64, 150 * 64,
|
||||
160 * 64,
|
||||
},
|
||||
[16]maskRounds{
|
||||
{0xffff, 10}, {0xfffe, 10}, {0xfffc, 10}, {0xfff8, 10},
|
||||
{0xfff0, 10},
|
||||
{0xffe0, 10}, {0xffc0, 10}, {0xff80, 10},
|
||||
{0xff00, 10}, {0xfe00, 10}, {0xfc00, 10}, {0xf800, 10},
|
||||
{0xf000, 10}, {0xe000, 10},
|
||||
{0xc000, 10}, {0x8000, 10},
|
||||
},
|
||||
},
|
||||
{
|
||||
[16]int{
|
||||
10 * 64, 19 * 64, 27 * 64, 34 * 64, 40 * 64, 45 * 64, 49 * 64,
|
||||
52 * 64, 54 * 64,
|
||||
55 * 64, 57 * 64, 60 * 64, 64 * 64, 69 * 64, 75 * 64, 82 * 64,
|
||||
},
|
||||
[16]maskRounds{
|
||||
{0xffff, 10}, {0xfffe, 9}, {0xfffc, 8}, {0xfff8, 7}, {0xfff0, 6},
|
||||
{0xffe0, 5}, {0xffc0, 4}, {0xff80, 3},
|
||||
{0xff00, 2}, {0xfe00, 1}, {0xfc00, 2}, {0xf800, 3}, {0xf000, 4},
|
||||
{0xe000, 5},
|
||||
{0xc000, 6}, {0x8000, 7},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
func TestMaskGen(t *testing.T) {
|
||||
input := [16][]byte{}
|
||||
for gcase, g := range goldenMask {
|
||||
for i, l := range g.in {
|
||||
buf := make([]byte, l)
|
||||
input[i] = buf[:]
|
||||
}
|
||||
|
||||
mr := genMask(input)
|
||||
|
||||
if !reflect.DeepEqual(mr, g.out) {
|
||||
t.Fatalf(
|
||||
"case %d: got %04x\n want %04x", gcase, mr,
|
||||
g.out,
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,31 +0,0 @@
|
||||
//go:build !noasm && !appengine && gc
|
||||
// +build !noasm,!appengine,gc
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
func blockArmSha2Go(dig *digest, p []byte) {
|
||||
panic("blockArmSha2Go called unexpectedly")
|
||||
}
|
||||
|
||||
// blockIntelSha has no Go body; its implementation is the assembly routine
// of the same name. h is the hash state and message the data to process.
//
//go:noescape
func blockIntelSha(h *[8]uint32, message []byte)
|
||||
|
||||
func blockIntelShaGo(dig *digest, p []byte) {
|
||||
blockIntelSha(&dig.h, p)
|
||||
}
|
||||
@@ -1,266 +0,0 @@
|
||||
//+build !noasm,!appengine,gc
|
||||
|
||||
// SHA intrinsic version of SHA256
|
||||
|
||||
// Kristofer Peterson, (C) 2018.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
#include "textflag.h"
|
||||
|
||||
DATA K<>+0x00(SB)/4, $0x428a2f98
|
||||
DATA K<>+0x04(SB)/4, $0x71374491
|
||||
DATA K<>+0x08(SB)/4, $0xb5c0fbcf
|
||||
DATA K<>+0x0c(SB)/4, $0xe9b5dba5
|
||||
DATA K<>+0x10(SB)/4, $0x3956c25b
|
||||
DATA K<>+0x14(SB)/4, $0x59f111f1
|
||||
DATA K<>+0x18(SB)/4, $0x923f82a4
|
||||
DATA K<>+0x1c(SB)/4, $0xab1c5ed5
|
||||
DATA K<>+0x20(SB)/4, $0xd807aa98
|
||||
DATA K<>+0x24(SB)/4, $0x12835b01
|
||||
DATA K<>+0x28(SB)/4, $0x243185be
|
||||
DATA K<>+0x2c(SB)/4, $0x550c7dc3
|
||||
DATA K<>+0x30(SB)/4, $0x72be5d74
|
||||
DATA K<>+0x34(SB)/4, $0x80deb1fe
|
||||
DATA K<>+0x38(SB)/4, $0x9bdc06a7
|
||||
DATA K<>+0x3c(SB)/4, $0xc19bf174
|
||||
DATA K<>+0x40(SB)/4, $0xe49b69c1
|
||||
DATA K<>+0x44(SB)/4, $0xefbe4786
|
||||
DATA K<>+0x48(SB)/4, $0x0fc19dc6
|
||||
DATA K<>+0x4c(SB)/4, $0x240ca1cc
|
||||
DATA K<>+0x50(SB)/4, $0x2de92c6f
|
||||
DATA K<>+0x54(SB)/4, $0x4a7484aa
|
||||
DATA K<>+0x58(SB)/4, $0x5cb0a9dc
|
||||
DATA K<>+0x5c(SB)/4, $0x76f988da
|
||||
DATA K<>+0x60(SB)/4, $0x983e5152
|
||||
DATA K<>+0x64(SB)/4, $0xa831c66d
|
||||
DATA K<>+0x68(SB)/4, $0xb00327c8
|
||||
DATA K<>+0x6c(SB)/4, $0xbf597fc7
|
||||
DATA K<>+0x70(SB)/4, $0xc6e00bf3
|
||||
DATA K<>+0x74(SB)/4, $0xd5a79147
|
||||
DATA K<>+0x78(SB)/4, $0x06ca6351
|
||||
DATA K<>+0x7c(SB)/4, $0x14292967
|
||||
DATA K<>+0x80(SB)/4, $0x27b70a85
|
||||
DATA K<>+0x84(SB)/4, $0x2e1b2138
|
||||
DATA K<>+0x88(SB)/4, $0x4d2c6dfc
|
||||
DATA K<>+0x8c(SB)/4, $0x53380d13
|
||||
DATA K<>+0x90(SB)/4, $0x650a7354
|
||||
DATA K<>+0x94(SB)/4, $0x766a0abb
|
||||
DATA K<>+0x98(SB)/4, $0x81c2c92e
|
||||
DATA K<>+0x9c(SB)/4, $0x92722c85
|
||||
DATA K<>+0xa0(SB)/4, $0xa2bfe8a1
|
||||
DATA K<>+0xa4(SB)/4, $0xa81a664b
|
||||
DATA K<>+0xa8(SB)/4, $0xc24b8b70
|
||||
DATA K<>+0xac(SB)/4, $0xc76c51a3
|
||||
DATA K<>+0xb0(SB)/4, $0xd192e819
|
||||
DATA K<>+0xb4(SB)/4, $0xd6990624
|
||||
DATA K<>+0xb8(SB)/4, $0xf40e3585
|
||||
DATA K<>+0xbc(SB)/4, $0x106aa070
|
||||
DATA K<>+0xc0(SB)/4, $0x19a4c116
|
||||
DATA K<>+0xc4(SB)/4, $0x1e376c08
|
||||
DATA K<>+0xc8(SB)/4, $0x2748774c
|
||||
DATA K<>+0xcc(SB)/4, $0x34b0bcb5
|
||||
DATA K<>+0xd0(SB)/4, $0x391c0cb3
|
||||
DATA K<>+0xd4(SB)/4, $0x4ed8aa4a
|
||||
DATA K<>+0xd8(SB)/4, $0x5b9cca4f
|
||||
DATA K<>+0xdc(SB)/4, $0x682e6ff3
|
||||
DATA K<>+0xe0(SB)/4, $0x748f82ee
|
||||
DATA K<>+0xe4(SB)/4, $0x78a5636f
|
||||
DATA K<>+0xe8(SB)/4, $0x84c87814
|
||||
DATA K<>+0xec(SB)/4, $0x8cc70208
|
||||
DATA K<>+0xf0(SB)/4, $0x90befffa
|
||||
DATA K<>+0xf4(SB)/4, $0xa4506ceb
|
||||
DATA K<>+0xf8(SB)/4, $0xbef9a3f7
|
||||
DATA K<>+0xfc(SB)/4, $0xc67178f2
|
||||
GLOBL K<>(SB), RODATA|NOPTR, $256
|
||||
|
||||
DATA SHUF_MASK<>+0x00(SB)/8, $0x0405060700010203
|
||||
DATA SHUF_MASK<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
|
||||
GLOBL SHUF_MASK<>(SB), RODATA|NOPTR, $16
|
||||
|
||||
// Register Usage
|
||||
// BX base address of constant table (constant)
|
||||
// DX hash_state (constant)
|
||||
// SI hash_data.data
|
||||
// DI hash_data.data + hash_data.length - 64 (constant)
|
||||
// X0 scratch
|
||||
// X1 scratch
|
||||
// X2 working hash state // ABEF
|
||||
// X3 working hash state // CDGH
|
||||
// X4 first 16 bytes of block
|
||||
// X5 second 16 bytes of block
|
||||
// X6 third 16 bytes of block
|
||||
// X7 fourth 16 bytes of block
|
||||
// X12 saved hash state // ABEF
|
||||
// X13 saved hash state // CDGH
|
||||
// X15 data shuffle mask (constant)
|
||||
|
||||
TEXT ·blockIntelSha(SB), NOSPLIT, $0-32
|
||||
MOVQ h+0(FP), DX
|
||||
MOVQ message_base+8(FP), SI
|
||||
MOVQ message_len+16(FP), DI
|
||||
LEAQ -64(SI)(DI*1), DI
|
||||
MOVOU (DX), X2
|
||||
MOVOU 16(DX), X1
|
||||
MOVO X2, X3
|
||||
PUNPCKLLQ X1, X2
|
||||
PUNPCKHLQ X1, X3
|
||||
PSHUFD $0x27, X2, X2
|
||||
PSHUFD $0x27, X3, X3
|
||||
MOVO SHUF_MASK<>(SB), X15
|
||||
LEAQ K<>(SB), BX
|
||||
|
||||
JMP TEST
|
||||
|
||||
LOOP:
|
||||
MOVO X2, X12
|
||||
MOVO X3, X13
|
||||
|
||||
// load block and shuffle
|
||||
MOVOU (SI), X4
|
||||
MOVOU 16(SI), X5
|
||||
MOVOU 32(SI), X6
|
||||
MOVOU 48(SI), X7
|
||||
PSHUFB X15, X4
|
||||
PSHUFB X15, X5
|
||||
PSHUFB X15, X6
|
||||
PSHUFB X15, X7
|
||||
|
||||
#define ROUND456 \
|
||||
PADDL X5, X0 \
|
||||
LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X5, X1 \
|
||||
LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1, XMM4, 4
|
||||
PADDL X1, X6 \
|
||||
LONG $0xf5cd380f \ // SHA256MSG2 XMM6, XMM5
|
||||
PSHUFD $0x4e, X0, X0 \
|
||||
LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xe5cc380f // SHA256MSG1 XMM4, XMM5
|
||||
|
||||
#define ROUND567 \
|
||||
PADDL X6, X0 \
|
||||
LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X6, X1 \
|
||||
LONG $0x0f3a0f66; WORD $0x04cd \ // PALIGNR XMM1, XMM5, 4
|
||||
PADDL X1, X7 \
|
||||
LONG $0xfecd380f \ // SHA256MSG2 XMM7, XMM6
|
||||
PSHUFD $0x4e, X0, X0 \
|
||||
LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xeecc380f // SHA256MSG1 XMM5, XMM6
|
||||
|
||||
#define ROUND674 \
|
||||
PADDL X7, X0 \
|
||||
LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X7, X1 \
|
||||
LONG $0x0f3a0f66; WORD $0x04ce \ // PALIGNR XMM1, XMM6, 4
|
||||
PADDL X1, X4 \
|
||||
LONG $0xe7cd380f \ // SHA256MSG2 XMM4, XMM7
|
||||
PSHUFD $0x4e, X0, X0 \
|
||||
LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xf7cc380f // SHA256MSG1 XMM6, XMM7
|
||||
|
||||
#define ROUND745 \
|
||||
PADDL X4, X0 \
|
||||
LONG $0xdacb380f \ // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X4, X1 \
|
||||
LONG $0x0f3a0f66; WORD $0x04cf \ // PALIGNR XMM1, XMM7, 4
|
||||
PADDL X1, X5 \
|
||||
LONG $0xeccd380f \ // SHA256MSG2 XMM5, XMM4
|
||||
PSHUFD $0x4e, X0, X0 \
|
||||
LONG $0xd3cb380f \ // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xfccc380f // SHA256MSG1 XMM7, XMM4
|
||||
|
||||
// rounds 0-3
|
||||
MOVO (BX), X0
|
||||
PADDL X4, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
|
||||
// rounds 4-7
|
||||
MOVO 1*16(BX), X0
|
||||
PADDL X5, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xe5cc380f // SHA256MSG1 XMM4, XMM5
|
||||
|
||||
// rounds 8-11
|
||||
MOVO 2*16(BX), X0
|
||||
PADDL X6, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
LONG $0xeecc380f // SHA256MSG1 XMM5, XMM6
|
||||
|
||||
MOVO 3*16(BX), X0; ROUND674 // rounds 12-15
|
||||
MOVO 4*16(BX), X0; ROUND745 // rounds 16-19
|
||||
MOVO 5*16(BX), X0; ROUND456 // rounds 20-23
|
||||
MOVO 6*16(BX), X0; ROUND567 // rounds 24-27
|
||||
MOVO 7*16(BX), X0; ROUND674 // rounds 28-31
|
||||
MOVO 8*16(BX), X0; ROUND745 // rounds 32-35
|
||||
MOVO 9*16(BX), X0; ROUND456 // rounds 36-39
|
||||
MOVO 10*16(BX), X0; ROUND567 // rounds 40-43
|
||||
MOVO 11*16(BX), X0; ROUND674 // rounds 44-47
|
||||
MOVO 12*16(BX), X0; ROUND745 // rounds 48-51
|
||||
|
||||
// rounds 52-55
|
||||
MOVO 13*16(BX), X0
|
||||
PADDL X5, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X5, X1
|
||||
LONG $0x0f3a0f66; WORD $0x04cc // PALIGNR XMM1, XMM4, 4
|
||||
PADDL X1, X6
|
||||
LONG $0xf5cd380f // SHA256MSG2 XMM6, XMM5
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
|
||||
// rounds 56-59
|
||||
MOVO 14*16(BX), X0
|
||||
PADDL X6, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
MOVO X6, X1
|
||||
LONG $0x0f3a0f66; WORD $0x04cd // PALIGNR XMM1, XMM5, 4
|
||||
PADDL X1, X7
|
||||
LONG $0xfecd380f // SHA256MSG2 XMM7, XMM6
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
|
||||
// rounds 60-63
|
||||
MOVO 15*16(BX), X0
|
||||
PADDL X7, X0
|
||||
LONG $0xdacb380f // SHA256RNDS2 XMM3, XMM2
|
||||
PSHUFD $0x4e, X0, X0
|
||||
LONG $0xd3cb380f // SHA256RNDS2 XMM2, XMM3
|
||||
|
||||
PADDL X12, X2
|
||||
PADDL X13, X3
|
||||
|
||||
ADDQ $64, SI
|
||||
|
||||
TEST:
|
||||
CMPQ SI, DI
|
||||
JBE LOOP
|
||||
|
||||
PSHUFD $0x4e, X3, X0
|
||||
LONG $0x0e3a0f66; WORD $0xf0c2 // PBLENDW XMM0, XMM2, 0xf0
|
||||
PSHUFD $0x4e, X2, X1
|
||||
LONG $0x0e3a0f66; WORD $0x0fcb // PBLENDW XMM1, XMM3, 0x0f
|
||||
PSHUFD $0x1b, X0, X0
|
||||
PSHUFD $0x1b, X1, X1
|
||||
|
||||
MOVOU X0, (DX)
|
||||
MOVOU X1, 16(DX)
|
||||
|
||||
RET
|
||||
@@ -1,78 +0,0 @@
|
||||
//go:build !noasm && !appengine && gc
|
||||
// +build !noasm,!appengine,gc
|
||||
|
||||
package sha256
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/binary"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func sha256hash(m []byte) (r [32]byte) {
|
||||
var h [8]uint32
|
||||
|
||||
h[0] = 0x6a09e667
|
||||
h[1] = 0xbb67ae85
|
||||
h[2] = 0x3c6ef372
|
||||
h[3] = 0xa54ff53a
|
||||
h[4] = 0x510e527f
|
||||
h[5] = 0x9b05688c
|
||||
h[6] = 0x1f83d9ab
|
||||
h[7] = 0x5be0cd19
|
||||
|
||||
blockIntelSha(&h, m)
|
||||
l0 := len(m)
|
||||
l := l0 & (BlockSize - 1)
|
||||
m = m[l0-l:]
|
||||
|
||||
var k [64]byte
|
||||
copy(k[:], m)
|
||||
|
||||
k[l] = 0x80
|
||||
|
||||
if l >= 56 {
|
||||
blockIntelSha(&h, k[:])
|
||||
binary.LittleEndian.PutUint64(k[0:8], 0)
|
||||
binary.LittleEndian.PutUint64(k[8:16], 0)
|
||||
binary.LittleEndian.PutUint64(k[16:24], 0)
|
||||
binary.LittleEndian.PutUint64(k[24:32], 0)
|
||||
binary.LittleEndian.PutUint64(k[32:40], 0)
|
||||
binary.LittleEndian.PutUint64(k[40:48], 0)
|
||||
binary.LittleEndian.PutUint64(k[48:56], 0)
|
||||
}
|
||||
binary.BigEndian.PutUint64(k[56:64], uint64(l0)<<3)
|
||||
blockIntelSha(&h, k[:])
|
||||
|
||||
binary.BigEndian.PutUint32(r[0:4], h[0])
|
||||
binary.BigEndian.PutUint32(r[4:8], h[1])
|
||||
binary.BigEndian.PutUint32(r[8:12], h[2])
|
||||
binary.BigEndian.PutUint32(r[12:16], h[3])
|
||||
binary.BigEndian.PutUint32(r[16:20], h[4])
|
||||
binary.BigEndian.PutUint32(r[20:24], h[5])
|
||||
binary.BigEndian.PutUint32(r[24:28], h[6])
|
||||
binary.BigEndian.PutUint32(r[28:32], h[7])
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// runTestSha reports whether hashfunc returns the same digest as the standard
// library's crypto/sha256 for a fixed 75-byte message, i.e. one complete
// 64-byte block plus a partial one.
func runTestSha(hashfunc func([]byte) [32]byte) bool {
	msg := []byte("This is a message. This is a message. This is a message. This is a message.")

	got := hashfunc(msg)
	want := sha256.Sum256(msg)

	return got == want
}
|
||||
|
||||
func TestSha0(t *testing.T) {
|
||||
if !runTestSha(Sum256) {
|
||||
t.Errorf("FAILED")
|
||||
}
|
||||
}
|
||||
|
||||
func TestSha1(t *testing.T) {
|
||||
if hasIntelSha && !runTestSha(sha256hash) {
|
||||
t.Errorf("FAILED")
|
||||
}
|
||||
}
|
||||
@@ -1,40 +0,0 @@
|
||||
//go:build !noasm && !appengine && gc
|
||||
// +build !noasm,!appengine,gc
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
func blockIntelShaGo(dig *digest, p []byte) {
|
||||
panic("blockIntelShaGo called unexpectedly")
|
||||
}
|
||||
|
||||
//go:noescape
|
||||
func blockArmSha2(h []uint32, message []uint8)
|
||||
|
||||
func blockArmSha2Go(dig *digest, p []byte) {
|
||||
|
||||
h := []uint32{
|
||||
dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6],
|
||||
dig.h[7],
|
||||
}
|
||||
|
||||
blockArmSha2(h[:], p[:])
|
||||
|
||||
dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4],
|
||||
h[5], h[6], h[7]
|
||||
}
|
||||
@@ -1,192 +0,0 @@
|
||||
//+build !noasm,!appengine,gc
|
||||
|
||||
// ARM64 version of SHA256
|
||||
|
||||
//
|
||||
// Minio Cloud Storage, (C) 2016 Minio, Inc.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
//
|
||||
|
||||
//
|
||||
// Based on implementation as found in https://github.com/jocover/sha256-armv8
|
||||
//
|
||||
// Use github.com/minio/asm2plan9s on this file to assemble ARM instructions to
|
||||
// their Plan9 equivalents
|
||||
//
|
||||
|
||||
// func blockArmSha2(h []uint32, message []uint8)
//
// Updates the SHA-256 state held in h with every complete 64-byte block of
// message, using the ARMv8 SHA-256 instructions. Those instructions (and the
// vector loads/stores) are emitted as raw WORD encodings; the trailing comment
// on each line gives the mnemonic. If message is shorter than 64 bytes the
// routine returns without reading or writing h.
//
// Register use:
//   R0      pointer to the state words in h
//   R1      read cursor into message (advanced 64 bytes per iteration)
//   R2      remaining length minus 64; the loop repeats while non-negative
//   R3      read cursor into the ·constants table
//   V0-V1   running state, loaded from h and stored back at the end
//   V2-V4   working state for the block being processed
//   V5-V8   message schedule words (byte-swapped with rev32 after loading)
//   V9-V10  schedule words plus constants, fed to sha256h/sha256h2
//   V16-V31 the constants table, loaded once before the loop
//
// The TEXT flag value 7 is NOPROF|DUPOK|NOSPLIT (see textflag.h).
TEXT ·blockArmSha2(SB), 7, $0
	MOVD h+0(FP), R0
	MOVD message+24(FP), R1
	MOVD message_len+32(FP), R2 // length of message
	SUBS $64, R2
	BMI complete // fewer than 64 bytes: nothing to do

	// Load constants table pointer
	MOVD $·constants(SB), R3

	// Cache constants table in registers v16 - v31
	WORD $0x4cdf2870 // ld1 {v16.4s-v19.4s}, [x3], #64
	WORD $0x4cdf7800 // ld1 {v0.4s}, [x0], #16
	WORD $0x4cdf2874 // ld1 {v20.4s-v23.4s}, [x3], #64

	WORD $0x4c407801 // ld1 {v1.4s}, [x0]
	WORD $0x4cdf2878 // ld1 {v24.4s-v27.4s}, [x3], #64
	WORD $0xd1004000 // sub x0, x0, #0x10
	WORD $0x4cdf287c // ld1 {v28.4s-v31.4s}, [x3], #64

loop:
	// Main loop
	WORD $0x4cdf2025 // ld1 {v5.16b-v8.16b}, [x1], #64
	WORD $0x4ea01c02 // mov v2.16b, v0.16b
	WORD $0x4ea11c23 // mov v3.16b, v1.16b
	WORD $0x6e2008a5 // rev32 v5.16b, v5.16b
	WORD $0x6e2008c6 // rev32 v6.16b, v6.16b
	WORD $0x4eb084a9 // add v9.4s, v5.4s, v16.4s
	WORD $0x6e2008e7 // rev32 v7.16b, v7.16b
	WORD $0x4eb184ca // add v10.4s, v6.4s, v17.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e2828c5 // sha256su0 v5.4s, v6.4s
	WORD $0x6e200908 // rev32 v8.16b, v8.16b
	WORD $0x4eb284e9 // add v9.4s, v7.4s, v18.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828e6 // sha256su0 v6.4s, v7.4s
	WORD $0x5e0860e5 // sha256su1 v5.4s, v7.4s, v8.4s
	WORD $0x4eb3850a // add v10.4s, v8.4s, v19.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e282907 // sha256su0 v7.4s, v8.4s
	WORD $0x5e056106 // sha256su1 v6.4s, v8.4s, v5.4s
	WORD $0x4eb484a9 // add v9.4s, v5.4s, v20.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828a8 // sha256su0 v8.4s, v5.4s
	WORD $0x5e0660a7 // sha256su1 v7.4s, v5.4s, v6.4s
	WORD $0x4eb584ca // add v10.4s, v6.4s, v21.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e2828c5 // sha256su0 v5.4s, v6.4s
	WORD $0x5e0760c8 // sha256su1 v8.4s, v6.4s, v7.4s
	WORD $0x4eb684e9 // add v9.4s, v7.4s, v22.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828e6 // sha256su0 v6.4s, v7.4s
	WORD $0x5e0860e5 // sha256su1 v5.4s, v7.4s, v8.4s
	WORD $0x4eb7850a // add v10.4s, v8.4s, v23.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e282907 // sha256su0 v7.4s, v8.4s
	WORD $0x5e056106 // sha256su1 v6.4s, v8.4s, v5.4s
	WORD $0x4eb884a9 // add v9.4s, v5.4s, v24.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828a8 // sha256su0 v8.4s, v5.4s
	WORD $0x5e0660a7 // sha256su1 v7.4s, v5.4s, v6.4s
	WORD $0x4eb984ca // add v10.4s, v6.4s, v25.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e2828c5 // sha256su0 v5.4s, v6.4s
	WORD $0x5e0760c8 // sha256su1 v8.4s, v6.4s, v7.4s
	WORD $0x4eba84e9 // add v9.4s, v7.4s, v26.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828e6 // sha256su0 v6.4s, v7.4s
	WORD $0x5e0860e5 // sha256su1 v5.4s, v7.4s, v8.4s
	WORD $0x4ebb850a // add v10.4s, v8.4s, v27.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e282907 // sha256su0 v7.4s, v8.4s
	WORD $0x5e056106 // sha256su1 v6.4s, v8.4s, v5.4s
	WORD $0x4ebc84a9 // add v9.4s, v5.4s, v28.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x5e2828a8 // sha256su0 v8.4s, v5.4s
	WORD $0x5e0660a7 // sha256su1 v7.4s, v5.4s, v6.4s
	WORD $0x4ebd84ca // add v10.4s, v6.4s, v29.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x5e0760c8 // sha256su1 v8.4s, v6.4s, v7.4s
	WORD $0x4ebe84e9 // add v9.4s, v7.4s, v30.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x4ebf850a // add v10.4s, v8.4s, v31.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e094062 // sha256h q2, q3, v9.4s
	WORD $0x5e095083 // sha256h2 q3, q4, v9.4s
	WORD $0x4ea21c44 // mov v4.16b, v2.16b
	WORD $0x5e0a4062 // sha256h q2, q3, v10.4s
	WORD $0x5e0a5083 // sha256h2 q3, q4, v10.4s
	WORD $0x4ea38421 // add v1.4s, v1.4s, v3.4s
	WORD $0x4ea28400 // add v0.4s, v0.4s, v2.4s

	SUBS $64, R2
	BPL loop // another complete block remains

	// Store result
	WORD $0x4c00a800 // st1 {v0.4s, v1.4s}, [x0]

complete:
	RET
|
||||
|
||||
// Constants table: the 64 32-bit SHA-256 round constants (256 bytes in
// total). Each 8-byte DATA literal packs two consecutive constants, with the
// earlier constant in the low 32 bits (e.g. 0x428a2f98 followed by
// 0x71374491 in the first entry). blockArmSha2 loads the whole table into
// V16-V31 before its main loop. The GLOBL flag value 8 is RODATA (see
// textflag.h).
DATA ·constants+0x0(SB)/8, $0x71374491428a2f98
DATA ·constants+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
DATA ·constants+0x10(SB)/8, $0x59f111f13956c25b
DATA ·constants+0x18(SB)/8, $0xab1c5ed5923f82a4
DATA ·constants+0x20(SB)/8, $0x12835b01d807aa98
DATA ·constants+0x28(SB)/8, $0x550c7dc3243185be
DATA ·constants+0x30(SB)/8, $0x80deb1fe72be5d74
DATA ·constants+0x38(SB)/8, $0xc19bf1749bdc06a7
DATA ·constants+0x40(SB)/8, $0xefbe4786e49b69c1
DATA ·constants+0x48(SB)/8, $0x240ca1cc0fc19dc6
DATA ·constants+0x50(SB)/8, $0x4a7484aa2de92c6f
DATA ·constants+0x58(SB)/8, $0x76f988da5cb0a9dc
DATA ·constants+0x60(SB)/8, $0xa831c66d983e5152
DATA ·constants+0x68(SB)/8, $0xbf597fc7b00327c8
DATA ·constants+0x70(SB)/8, $0xd5a79147c6e00bf3
DATA ·constants+0x78(SB)/8, $0x1429296706ca6351
DATA ·constants+0x80(SB)/8, $0x2e1b213827b70a85
DATA ·constants+0x88(SB)/8, $0x53380d134d2c6dfc
DATA ·constants+0x90(SB)/8, $0x766a0abb650a7354
DATA ·constants+0x98(SB)/8, $0x92722c8581c2c92e
DATA ·constants+0xa0(SB)/8, $0xa81a664ba2bfe8a1
DATA ·constants+0xa8(SB)/8, $0xc76c51a3c24b8b70
DATA ·constants+0xb0(SB)/8, $0xd6990624d192e819
DATA ·constants+0xb8(SB)/8, $0x106aa070f40e3585
DATA ·constants+0xc0(SB)/8, $0x1e376c0819a4c116
DATA ·constants+0xc8(SB)/8, $0x34b0bcb52748774c
DATA ·constants+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
DATA ·constants+0xd8(SB)/8, $0x682e6ff35b9cca4f
DATA ·constants+0xe0(SB)/8, $0x78a5636f748f82ee
DATA ·constants+0xe8(SB)/8, $0x8cc7020884c87814
DATA ·constants+0xf0(SB)/8, $0xa4506ceb90befffa
DATA ·constants+0xf8(SB)/8, $0xc67178f2bef9a3f7

GLOBL ·constants(SB), 8, $256
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
//go:build appengine || noasm || (!amd64 && !arm64) || !gc
|
||||
// +build appengine noasm !amd64,!arm64 !gc
|
||||
|
||||
/*
|
||||
* Minio Cloud Storage, (C) 2019 Minio, Inc.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
package sha256
|
||||
|
||||
func blockIntelShaGo(dig *digest, p []byte) {
|
||||
panic("blockIntelShaGo called unexpectedly")
|
||||
|
||||
}
|
||||
|
||||
func blockArmSha2Go(dig *digest, p []byte) {
|
||||
panic("blockArmSha2Go called unexpectedly")
|
||||
}
|
||||
@@ -1,15 +0,0 @@
|
||||
#!/bin/sh
# Build-checks the module for every GOOS/GOARCH pair printed by
# `go tool dist list`, once with default tags and once for each combination
# of the `noasm` and `appengine` build tags. Build output is discarded; only
# success or failure matters.

# Stop at the first failing build.
set -e

# Each line of `go tool dist list` has the form os/arch; split it on '/'.
# -r keeps read from treating backslashes in the input as escapes.
go tool dist list | while IFS=/ read -r os arch; do
	echo "Checking $os/$arch..."
	echo " normal"
	GOARCH="$arch" GOOS="$os" go build -o /dev/null ./...
	echo " noasm"
	GOARCH="$arch" GOOS="$os" go build -tags noasm -o /dev/null ./...
	echo " appengine"
	GOARCH="$arch" GOOS="$os" go build -tags appengine -o /dev/null ./...
	echo " noasm,appengine"
	GOARCH="$arch" GOOS="$os" go build -tags 'appengine noasm' -o /dev/null ./...
done
|
||||
Reference in New Issue
Block a user