flat assembler
Message board for the users of flat assembler.
Tutorials and Examples > CRC-32
bzt 22 Dec 2020, 14:50
Here's how I do it (CRC-32 with the Castagnoli polynomial and a pre-calculated lookup table):
Code:
; --- CRC-32c ---
; IN:  esi = buffer, ecx = length
; OUT: edx = crc
crc32_calc:
        xor     edx, edx
        xor     eax, eax
        xor     ebx, ebx
.next:
        lodsb
        mov     bl, dl
        xor     bl, al
        mov     eax, edx
        shr     edx, 8
        xor     edx, dword [crclkp+4*ebx]
        dec     ecx
        jnz     .next
        ret

crclkp:
        dd 000000000h, 0F26B8303h, 0E13B70F7h, 01350F3F4h, 0C79A971Fh, 035F1141Ch, 026A1E7E8h, 0D4CA64EBh
        dd 08AD958CFh, 078B2DBCCh, 06BE22838h, 09989AB3Bh, 04D43CFD0h, 0BF284CD3h, 0AC78BF27h, 05E133C24h
        dd 0105EC76Fh, 0E235446Ch, 0F165B798h, 0030E349Bh, 0D7C45070h, 025AFD373h, 036FF2087h, 0C494A384h
        dd 09A879FA0h, 068EC1CA3h, 07BBCEF57h, 089D76C54h, 05D1D08BFh, 0AF768BBCh, 0BC267848h, 04E4DFB4Bh
        dd 020BD8EDEh, 0D2D60DDDh, 0C186FE29h, 033ED7D2Ah, 0E72719C1h, 0154C9AC2h, 0061C6936h, 0F477EA35h
        dd 0AA64D611h, 0580F5512h, 04B5FA6E6h, 0B93425E5h, 06DFE410Eh, 09F95C20Dh, 08CC531F9h, 07EAEB2FAh
        dd 030E349B1h, 0C288CAB2h, 0D1D83946h, 023B3BA45h, 0F779DEAEh, 005125DADh, 01642AE59h, 0E4292D5Ah
        dd 0BA3A117Eh, 04851927Dh, 05B016189h, 0A96AE28Ah, 07DA08661h, 08FCB0562h, 09C9BF696h, 06EF07595h
        dd 0417B1DBCh, 0B3109EBFh, 0A0406D4Bh, 0522BEE48h, 086E18AA3h, 0748A09A0h, 067DAFA54h, 095B17957h
        dd 0CBA24573h, 039C9C670h, 02A993584h, 0D8F2B687h, 00C38D26Ch, 0FE53516Fh, 0ED03A29Bh, 01F682198h
        dd 05125DAD3h, 0A34E59D0h, 0B01EAA24h, 042752927h, 096BF4DCCh, 064D4CECFh, 077843D3Bh, 085EFBE38h
        dd 0DBFC821Ch, 02997011Fh, 03AC7F2EBh, 0C8AC71E8h, 01C661503h, 0EE0D9600h, 0FD5D65F4h, 00F36E6F7h
        dd 061C69362h, 093AD1061h, 080FDE395h, 072966096h, 0A65C047Dh, 05437877Eh, 04767748Ah, 0B50CF789h
        dd 0EB1FCBADh, 0197448AEh, 00A24BB5Ah, 0F84F3859h, 02C855CB2h, 0DEEEDFB1h, 0CDBE2C45h, 03FD5AF46h
        dd 07198540Dh, 083F3D70Eh, 090A324FAh, 062C8A7F9h, 0B602C312h, 044694011h, 05739B3E5h, 0A55230E6h
        dd 0FB410CC2h, 0092A8FC1h, 01A7A7C35h, 0E811FF36h, 03CDB9BDDh, 0CEB018DEh, 0DDE0EB2Ah, 02F8B6829h
        dd 082F63B78h, 0709DB87Bh, 063CD4B8Fh, 091A6C88Ch, 0456CAC67h, 0B7072F64h, 0A457DC90h, 0563C5F93h
        dd 0082F63B7h, 0FA44E0B4h, 0E9141340h, 01B7F9043h, 0CFB5F4A8h, 03DDE77ABh, 02E8E845Fh, 0DCE5075Ch
        dd 092A8FC17h, 060C37F14h, 073938CE0h, 081F80FE3h, 055326B08h, 0A759E80Bh, 0B4091BFFh, 0466298FCh
        dd 01871A4D8h, 0EA1A27DBh, 0F94AD42Fh, 00B21572Ch, 0DFEB33C7h, 02D80B0C4h, 03ED04330h, 0CCBBC033h
        dd 0A24BB5A6h, 0502036A5h, 04370C551h, 0B11B4652h, 065D122B9h, 097BAA1BAh, 084EA524Eh, 07681D14Dh
        dd 02892ED69h, 0DAF96E6Ah, 0C9A99D9Eh, 03BC21E9Dh, 0EF087A76h, 01D63F975h, 00E330A81h, 0FC588982h
        dd 0B21572C9h, 0407EF1CAh, 0532E023Eh, 0A145813Dh, 0758FE5D6h, 087E466D5h, 094B49521h, 066DF1622h
        dd 038CC2A06h, 0CAA7A905h, 0D9F75AF1h, 02B9CD9F2h, 0FF56BD19h, 00D3D3E1Ah, 01E6DCDEEh, 0EC064EEDh
        dd 0C38D26C4h, 031E6A5C7h, 022B65633h, 0D0DDD530h, 00417B1DBh, 0F67C32D8h, 0E52CC12Ch, 01747422Fh
        dd 049547E0Bh, 0BB3FFD08h, 0A86F0EFCh, 05A048DFFh, 08ECEE914h, 07CA56A17h, 06FF599E3h, 09D9E1AE0h
        dd 0D3D3E1ABh, 021B862A8h, 032E8915Ch, 0C083125Fh, 0144976B4h, 0E622F5B7h, 0F5720643h, 007198540h
        dd 0590AB964h, 0AB613A67h, 0B831C993h, 04A5A4A90h, 09E902E7Bh, 06CFBAD78h, 07FAB5E8Ch, 08DC0DD8Fh
        dd 0E330A81Ah, 0115B2B19h, 0020BD8EDh, 0F0605BEEh, 024AA3F05h, 0D6C1BC06h, 0C5914FF2h, 037FACCF1h
        dd 069E9F0D5h, 09B8273D6h, 088D28022h, 07AB90321h, 0AE7367CAh, 05C18E4C9h, 04F48173Dh, 0BD23943Eh
        dd 0F36E6F75h, 00105EC76h, 012551F82h, 0E03E9C81h, 034F4F86Ah, 0C69F7B69h, 0D5CF889Dh, 027A40B9Eh
        dd 079B737BAh, 08BDCB4B9h, 0988C474Dh, 06AE7C44Eh, 0BE2DA0A5h, 04C4623A6h, 05F16D052h, 0AD7D5351h

Note: this is the CRC variant that has a hardware implementation on Intel CPUs (the SSE4.2 crc32 instruction). It is used mostly in networking.
For the "standard" CRC-32, also called ANSI CRC (used by GPT, PNG and gzip, for example), you will have to XOR the checksum with 0FFFFFFFFh before and after the loop, and the lookup table is different of course. Cheers, bzt
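If you would rather not paste 256 precalculated constants, fasm can also build the table at assembly time with repeat blocks. The sketch below is not from bzt's post; the crc_poly and crc_entry names and the crclkp label are only illustrative, and the polynomial shown is the reflected ANSI/zlib one (use 082F63B78h for CRC-32c instead):

Code:
; assembly-time lookup table generation (sketch, not from the original post)
crc_poly = 0EDB88320h                   ; reflected ANSI/zlib polynomial - pick yours
crclkp:
repeat 256
        crc_entry = % - 1               ; table index 0..255
        repeat 8
            if crc_entry and 1
                crc_entry = (crc_entry shr 1) xor crc_poly
            else
                crc_entry = crc_entry shr 1
            end if
        end repeat
        dd crc_entry
end repeat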
macomics 21 Aug 2021, 19:59
Or you can do it like this.
Code:
; IN EAX = Current CRC32 xor new_byte
; IN EBX = HASH code (0EDB88320h)
CharCRC32:
        repeat 8
            shr     eax, 1
            sbb     edx, edx            ; EDX = -1 if the shifted-out bit was 1, else 0
            and     edx, ebx
            xor     eax, edx            ; conditionally apply the polynomial
        end repeat
        retn

Code:
; IN EBX = HASH code (0EDB88320h)
; IN EDI = TABLE buffer (256 dwords)
CRC32Table:
        xor     ecx, ecx
@@:
        mov     eax, ecx
        call    CharCRC32
        mov     [edi + ecx * 4], eax
        inc     cl
        jnz     @b
        retn

Code:
; IN EBX = TABLE buffer
; IN EAX = Prev CRC32 value (start 0)
; IN ESI = Buffer
; IN ECX = Length
; OUT EAX = CRC32
CRC32ByTable:
        mov     edx, eax
        not     edx
@@:
        movzx   eax, dl
        xor     al, byte [esi]          ; table index = (crc xor data byte) and 0FFh
        shr     edx, 8
        inc     esi
        xor     edx, [ebx + eax * 4]
        loop    @b
        not     edx
        mov     eax, edx
        retn

Code:
; IN EBX = HASH code (0EDB88320h)
; IN EAX = Prev CRC32 value (start 0)
; IN ESI = Buffer
; IN ECX = Length
; OUT EAX = CRC32
CRC32OnTheFly:
        not     eax
@@:
        xor     al, byte [esi]
        call    CharCRC32
        inc     esi
        loop    @b
        not     eax
        retn

I typed this in the browser - sorry for the typos.
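Putting those pieces together, a minimal call sequence could look like the sketch below. This is not from the post above: crc_table, my_data and my_data_len are placeholder names, and the polynomial is the reflected ANSI/zlib one.

Code:
; hypothetical glue code - build the table once, then checksum a buffer with it
        mov     ebx, 0EDB88320h         ; reflected ANSI/zlib polynomial
        mov     edi, crc_table          ; 256 dwords of scratch space (placeholder label)
        call    CRC32Table
        mov     ebx, crc_table          ; CRC32ByTable takes the table in EBX
        xor     eax, eax                ; previous CRC = 0 for a fresh buffer
        mov     esi, my_data            ; placeholder buffer
        mov     ecx, my_data_len        ; placeholder length
        call    CRC32ByTable            ; result in EAX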
sylware 22 Aug 2021, 13:33
It would be interesting to get some zlib crc32 implementations. There is an SSE-accelerated implementation in Linux (modulo the xor 0xFFFFFFFF stuff).
I am looking for an AVX2 (256-bit) accelerated one, if that makes sense (ymm regs instead of xmm regs). I plan to have a look at zlib-ng, but it uses those abominations, the gcc intrinsics.
macomics 22 Aug 2021, 16:40
On processors with SSE4.2 there is a built-in crc32 instruction (see Intel SDM Vol. 2A, page 3-329).
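For reference, a byte-at-a-time sketch of that instruction in fasm syntax (not from the post above). Note that it computes CRC-32C (Castagnoli), not the zlib polynomial, and like bzt's table-driven crc32_calc it does no pre/post inversion, so it should produce matching values. A real routine would process dwords or qwords per step; this only shows the usage:

Code:
; hardware CRC-32C sketch (requires SSE4.2)
; IN:  esi = buffer, ecx = length
; OUT: edx = crc (no pre/post inversion, same convention as crc32_calc above)
crc32c_hw:
        xor     edx, edx
.next:
        crc32   edx, byte [esi]         ; fold one byte into the running CRC-32C
        inc     esi
        dec     ecx
        jnz     .next
        ret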
sylware 22 Aug 2021, 17:51
The internet says Intel's "crc32" is actually CRC-32C, which is not zlib's CRC-32. I recall a post from Adler himself.
macomics 22 Aug 2021, 21:43
sylware wrote: internet says intel "crc32" is actually crc32c which is not zlib crc32. I recall a post from adler himself.
sylware 22 Aug 2021, 23:15
that's why I was asking about an AVX2 implementation of zlib crc32.
bzt 01 Sep 2021, 15:47
macomics wrote:
Code:
; --- CRC-32 (ANSI, like in zlib) ---
; IN:  esi = buffer, ecx = length
; OUT: edx = crc
crc32_calc:
        xor     edx, edx
        xor     eax, eax
        xor     ebx, ebx
        dec     edx                     ; <- crc value starts with 0FFFFFFFFh and not 0
.next:
        lodsb
        mov     bl, dl
        xor     bl, al
        mov     eax, edx
        shr     edx, 8
        xor     edx, dword [crclkp+4*ebx]
        dec     ecx
        jnz     .next
        xor     edx, 0FFFFFFFFh         ; <- you must also invert the bits after the calculation
        ret

; ...and the lookup table is entirely different of course
crclkp:
        dd 000000000h, 077073096h, 0ee0e612ch, 0990951bah, 0076dc419h, 0706af48fh, 0e963a535h, 09e6495a3h, 00edb8832h
        dd 079dcb8a4h, 0e0d5e91eh, 097d2d988h, 009b64c2bh, 07eb17cbdh, 0e7b82d07h, 090bf1d91h, 01db71064h, 06ab020f2h
        dd 0f3b97148h, 084be41deh, 01adad47dh, 06ddde4ebh, 0f4d4b551h, 083d385c7h, 0136c9856h, 0646ba8c0h, 0fd62f97ah
        dd 08a65c9ech, 014015c4fh, 063066cd9h, 0fa0f3d63h, 08d080df5h, 03b6e20c8h, 04c69105eh, 0d56041e4h, 0a2677172h
        dd 03c03e4d1h, 04b04d447h, 0d20d85fdh, 0a50ab56bh, 035b5a8fah, 042b2986ch, 0dbbbc9d6h, 0acbcf940h, 032d86ce3h
        dd 045df5c75h, 0dcd60dcfh, 0abd13d59h, 026d930ach, 051de003ah, 0c8d75180h, 0bfd06116h, 021b4f4b5h, 056b3c423h
        dd 0cfba9599h, 0b8bda50fh, 02802b89eh, 05f058808h, 0c60cd9b2h, 0b10be924h, 02f6f7c87h, 058684c11h, 0c1611dabh
        dd 0b6662d3dh, 076dc4190h, 001db7106h, 098d220bch, 0efd5102ah, 071b18589h, 006b6b51fh, 09fbfe4a5h, 0e8b8d433h
        dd 07807c9a2h, 00f00f934h, 09609a88eh, 0e10e9818h, 07f6a0dbbh, 0086d3d2dh, 091646c97h, 0e6635c01h, 06b6b51f4h
        dd 01c6c6162h, 0856530d8h, 0f262004eh, 06c0695edh, 01b01a57bh, 08208f4c1h, 0f50fc457h, 065b0d9c6h, 012b7e950h
        dd 08bbeb8eah, 0fcb9887ch, 062dd1ddfh, 015da2d49h, 08cd37cf3h, 0fbd44c65h, 04db26158h, 03ab551ceh, 0a3bc0074h
        dd 0d4bb30e2h, 04adfa541h, 03dd895d7h, 0a4d1c46dh, 0d3d6f4fbh, 04369e96ah, 0346ed9fch, 0ad678846h, 0da60b8d0h
        dd 044042d73h, 033031de5h, 0aa0a4c5fh, 0dd0d7cc9h, 05005713ch, 0270241aah, 0be0b1010h, 0c90c2086h, 05768b525h
        dd 0206f85b3h, 0b966d409h, 0ce61e49fh, 05edef90eh, 029d9c998h, 0b0d09822h, 0c7d7a8b4h, 059b33d17h, 02eb40d81h
        dd 0b7bd5c3bh, 0c0ba6cadh, 0edb88320h, 09abfb3b6h, 003b6e20ch, 074b1d29ah, 0ead54739h, 09dd277afh, 004db2615h
        dd 073dc1683h, 0e3630b12h, 094643b84h, 00d6d6a3eh, 07a6a5aa8h, 0e40ecf0bh, 09309ff9dh, 00a00ae27h, 07d079eb1h
        dd 0f00f9344h, 08708a3d2h, 01e01f268h, 06906c2feh, 0f762575dh, 0806567cbh, 0196c3671h, 06e6b06e7h, 0fed41b76h
        dd 089d32be0h, 010da7a5ah, 067dd4acch, 0f9b9df6fh, 08ebeeff9h, 017b7be43h, 060b08ed5h, 0d6d6a3e8h, 0a1d1937eh
        dd 038d8c2c4h, 04fdff252h, 0d1bb67f1h, 0a6bc5767h, 03fb506ddh, 048b2364bh, 0d80d2bdah, 0af0a1b4ch, 036034af6h
        dd 041047a60h, 0df60efc3h, 0a867df55h, 0316e8eefh, 04669be79h, 0cb61b38ch, 0bc66831ah, 0256fd2a0h, 05268e236h
        dd 0cc0c7795h, 0bb0b4703h, 0220216b9h, 05505262fh, 0c5ba3bbeh, 0b2bd0b28h, 02bb45a92h, 05cb36a04h, 0c2d7ffa7h
        dd 0b5d0cf31h, 02cd99e8bh, 05bdeae1dh, 09b64c2b0h, 0ec63f226h, 0756aa39ch, 0026d930ah, 09c0906a9h, 0eb0e363fh
        dd 072076785h, 005005713h, 095bf4a82h, 0e2b87a14h, 07bb12baeh, 00cb61b38h, 092d28e9bh, 0e5d5be0dh, 07cdcefb7h
        dd 00bdbdf21h, 086d3d2d4h, 0f1d4e242h, 068ddb3f8h, 01fda836eh, 081be16cdh, 0f6b9265bh, 06fb077e1h, 018b74777h
        dd 088085ae6h, 0ff0f6a70h, 066063bcah, 011010b5ch, 08f659effh, 0f862ae69h, 0616bffd3h, 0166ccf45h, 0a00ae278h
        dd 0d70dd2eeh, 04e048354h, 03903b3c2h, 0a7672661h, 0d06016f7h, 04969474dh, 03e6e77dbh, 0aed16a4ah, 0d9d65adch
        dd 040df0b66h, 037d83bf0h, 0a9bcae53h, 0debb9ec5h, 047b2cf7fh, 030b5ffe9h, 0bdbdf21ch, 0cabac28ah, 053b39330h
        dd 024b4a3a6h, 0bad03605h, 0cdd70693h, 054de5729h, 023d967bfh, 0b3667a2eh, 0c4614ab8h, 05d681b02h, 02a6f2b94h
        dd 0b40bbe37h, 0c30c8ea1h, 05a05df1bh, 02d02ef8dh

sylware wrote: that's why I was asking about an AVX2 implementation of zlib crc32.

Otherwise use "gcc -S zlib/crc32_simd.c" to convert the intrinsics into assembly, but remember it is going to be AT&T syntax, not fasm's (source and destination operands swapped). Cheers, bzt
bitRAKE 01 Sep 2021, 20:06
https://gcc.godbolt.org/z/edobc3GYY (I think hand-coded asm could be much smaller.) The paper is a good example of the Barrett reduction method, which is widely applicable in GF(2).
wht36 08 Dec 2025, 10:48
Here's a very basic version (non-optimised, very slow)
Code: ; Compute CRC32 for data validation ; #define CRC32_POLY 0xEDB88320 ;uint32_t nz_crc32(const uint8_t *data, size_t len) { ; uint32_t crc = 0xFFFFFFFF; ; for(size_t i=0; i<len; i++) { ; crc ^= data[i]; ; for(int j=0; j<8; j++) ; crc = (crc >> 1) ^ (CRC32_POLY & -(crc & 1)); ; } ; return ~crc; ;} align 16 naive_crc32: mov eax, -1 ; uint32_t crc = 0xFFFFFFFF lea r8,[rcx-1]; r8 = data+i add rdx,rcx ; rdx = end of data .i: inc r8 ; for(size_t i=0; i<len; i++) cmp r8, rdx jae .z movzx ecx, BYTE [r8] ; load byte from data[i] xor eax, ecx ; crc ^= data[i]; mov r9,9 ; for(int j=0; j<8; j++) .j: dec r9 jz .i ; crc = (crc >> 1) ^ (CRC32_POLY & -(crc & 1)); mov ecx,eax shr eax, 1 and ecx, 1 neg ecx and ecx, 0xEDB88320 xor eax, ecx jmp .j .z: not eax ret Intel has various optimised flavours of crc32 at https://github.com/intel/isa-l They are pretty fast. Intel uses NASM/YASM so one would need to modify it. An avx version for standard crc32 (used in e.g. gzip) modified for FASM follows: Code: macro vpslldq op1,op2,op3 { if op3 eq vpslldq op1,op1,op2 else vpslldq op1,op2,op3 end if } macro vpand op1,op2,op3 { if op3 eq vpand op1,op1,op2 else vpand op1,op2,op3 end if } macro vpsrldq op1,op2,op3 { if op3 eq vpsrldq op1,op1,op2 else vpsrldq op1,op2,op3 end if } macro vpxor op1,op2,op3 { if op3 eq vpxor op1,op1,op2 else vpxor op1,op2,op3 end if } macro vxorps op1,op2,op3 { if op3 eq vxorps op1,op1,op2 else vxorps op1,op2,op3 end if } macro vpshufb op1,op2,op3 { if op3 eq vpshufb op1,op1,op2 else vpshufb op1,op2,op3 end if } macro vpclmulqdq op1,op2,op3,op4 { if op4 eq vpclmulqdq op1,op1,op2,op3 else vpclmulqdq op1,op2,op3,op4 end if } ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Copyright(c) 2011-2020 Intel Corporation All rights reserved. ; ; Redistribution and use in source and binary forms, with or without ; modification, are permitted provided that the following conditions ; are met: ; * Redistributions of source code must retain the above copyright ; notice, this list of conditions and the following disclaimer. ; * Redistributions in binary form must reproduce the above copyright ; notice, this list of conditions and the following disclaimer in ; the documentation and/or other materials provided with the ; distribution. ; * Neither the name of Intel Corporation nor the names of its ; contributors may be used to endorse or promote products derived ; from this software without specific prior written permission. ; ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ; "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT ; OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, ; SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT ; LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, ; DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY ; THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE ; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; Function API: ; UINT32 crc32_gzip_refl_by8_02( ; UINT32 init_crc, //initial CRC value, 32 bits ; const unsigned char *buf, //buffer pointer to calculate CRC on ; UINT64 len //buffer length in bytes (64-bit data) ; ); ; ; Authors: ; Erdinc Ozturk ; Vinodh Gopal ; James Guilford ; ; Reference paper titled "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" ; URL: http://download.intel.com/design/intarch/papers/323102.pdf ; ; ; sample yasm command line: ; yasm -f x64 -f elf64 -X gnu -g dwarf2 crc32_gzip_refl_by8 ; ; As explained here: ; http://docs.oracle.com/javase/7/docs/api/java/util/zip/package-summary.html ; CRC-32 checksum is described in RFC 1952 ; Implementing RFC 1952 CRC: ; http://www.ietf.org/rfc/rfc1952.txt fetch_dist equ 4096 PREFETCH equ prefetcht1 arg1 equ rcx arg2 equ rdx arg3 equ r8 arg1_low32 equ ecx TMP equ 16*0 XMM_SAVE equ 16*2 VARIABLE_OFFSET equ 16*10+8 align 16 crc32_gzip_refl_by8_02: not arg1_low32 sub rsp, VARIABLE_OFFSET ; push the xmm registers into the stack to maintain vmovdqa [rsp + XMM_SAVE + 16*0], xmm6 vmovdqa [rsp + XMM_SAVE + 16*1], xmm7 vmovdqa [rsp + XMM_SAVE + 16*2], xmm8 vmovdqa [rsp + XMM_SAVE + 16*3], xmm9 vmovdqa [rsp + XMM_SAVE + 16*4], xmm10 vmovdqa [rsp + XMM_SAVE + 16*5], xmm11 vmovdqa [rsp + XMM_SAVE + 16*6], xmm12 vmovdqa [rsp + XMM_SAVE + 16*7], xmm13 ; check if smaller than 256B cmp arg3, 256 jl .less_than_256 ; load the initial crc value vmovd xmm10, arg1_low32 ; initial crc ; receive the initial 64B data, xor the initial crc value vmovdqu xmm0, [arg2+16*0] vmovdqu xmm1, [arg2+16*1] vmovdqu xmm2, [arg2+16*2] vmovdqu xmm3, [arg2+16*3] vmovdqu xmm4, [arg2+16*4] vmovdqu xmm5, [arg2+16*5] vmovdqu xmm6, [arg2+16*6] vmovdqu xmm7, [arg2+16*7] ; XOR the initial_crc value vpxor xmm0, xmm10 vmovdqa xmm10, [rk3] ;xmm10 has rk3 and rk4 ;imm value of pclmulqdq instruction will determine which constant to use ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ; we subtract 256 instead of 128 to save one instruction from the loop sub arg3, 256 ; at this section of the code, there is 128*x+y (0<=y<128) bytes of buffer. The fold_128_B_loop ; loop will fold 128B at a time until we have 128+y Bytes of buffer ; check if there is at least 4KB (fetch distance) + 128B in the buffer cmp arg3, (fetch_dist + 128) jb .fold_128_B_loop ; fold 128B at a time. 
This section of the code folds 8 xmm registers in parallel align 16 .fold_and_prefetch_128_B_loop: add arg2, 128 PREFETCH [arg2+fetch_dist+0] vmovdqu xmm9, [arg2+16*0] vmovdqu xmm12, [arg2+16*1] vpclmulqdq xmm8, xmm0, xmm10, 0x10 vpclmulqdq xmm0, xmm0, xmm10 , 0x1 vpclmulqdq xmm13, xmm1, xmm10, 0x10 vpclmulqdq xmm1, xmm1, xmm10 , 0x1 vpxor xmm0, xmm9 vxorps xmm0, xmm8 vpxor xmm1, xmm12 vxorps xmm1, xmm13 vmovdqu xmm9, [arg2+16*2] vmovdqu xmm12, [arg2+16*3] vpclmulqdq xmm8, xmm2, xmm10, 0x10 vpclmulqdq xmm2, xmm2, xmm10 , 0x1 vpclmulqdq xmm13, xmm3, xmm10, 0x10 vpclmulqdq xmm3, xmm3, xmm10 , 0x1 vpxor xmm2, xmm9 vxorps xmm2, xmm8 vpxor xmm3, xmm12 vxorps xmm3, xmm13 PREFETCH [arg2+fetch_dist+64] vmovdqu xmm9, [arg2+16*4] vmovdqu xmm12, [arg2+16*5] vpclmulqdq xmm8, xmm4, xmm10, 0x10 vpclmulqdq xmm4, xmm4, xmm10 , 0x1 vpclmulqdq xmm13, xmm5, xmm10, 0x10 vpclmulqdq xmm5, xmm5, xmm10 , 0x1 vpxor xmm4, xmm9 vxorps xmm4, xmm8 vpxor xmm5, xmm12 vxorps xmm5, xmm13 vmovdqu xmm9, [arg2+16*6] vmovdqu xmm12, [arg2+16*7] vpclmulqdq xmm8, xmm6, xmm10, 0x10 vpclmulqdq xmm6, xmm6, xmm10 , 0x1 vpclmulqdq xmm13, xmm7, xmm10, 0x10 vpclmulqdq xmm7, xmm7, xmm10 , 0x1 vpxor xmm6, xmm9 vxorps xmm6, xmm8 vpxor xmm7, xmm12 vxorps xmm7, xmm13 sub arg3, 128 ; check if there is another 4KB (fetch distance) + 128B in the buffer cmp arg3, (fetch_dist + 128) jge .fold_and_prefetch_128_B_loop ; fold 128B at a time. This section of the code folds 8 xmm registers in parallel align 16 .fold_128_B_loop: add arg2, 128 vmovdqu xmm9, [arg2+16*0] vmovdqu xmm12, [arg2+16*1] vpclmulqdq xmm8, xmm0, xmm10, 0x10 vpclmulqdq xmm0, xmm0, xmm10 , 0x1 vpclmulqdq xmm13, xmm1, xmm10, 0x10 vpclmulqdq xmm1, xmm1, xmm10 , 0x1 vpxor xmm0, xmm9 vxorps xmm0, xmm8 vpxor xmm1, xmm12 vxorps xmm1, xmm13 vmovdqu xmm9, [arg2+16*2] vmovdqu xmm12, [arg2+16*3] vpclmulqdq xmm8, xmm2, xmm10, 0x10 vpclmulqdq xmm2, xmm2, xmm10 , 0x1 vpclmulqdq xmm13, xmm3, xmm10, 0x10 vpclmulqdq xmm3, xmm3, xmm10 , 0x1 vpxor xmm2, xmm9 vxorps xmm2, xmm8 vpxor xmm3, xmm12 vxorps xmm3, xmm13 vmovdqu xmm9, [arg2+16*4] vmovdqu xmm12, [arg2+16*5] vpclmulqdq xmm8, xmm4, xmm10, 0x10 vpclmulqdq xmm4, xmm4, xmm10 , 0x1 vpclmulqdq xmm13, xmm5, xmm10, 0x10 vpclmulqdq xmm5, xmm5, xmm10 , 0x1 vpxor xmm4, xmm9 vxorps xmm4, xmm8 vpxor xmm5, xmm12 vxorps xmm5, xmm13 vmovdqu xmm9, [arg2+16*6] vmovdqu xmm12, [arg2+16*7] vpclmulqdq xmm8, xmm6, xmm10, 0x10 vpclmulqdq xmm6, xmm6, xmm10 , 0x1 vpclmulqdq xmm13, xmm7, xmm10, 0x10 vpclmulqdq xmm7, xmm7, xmm10 , 0x1 vpxor xmm6, xmm9 vxorps xmm6, xmm8 vpxor xmm7, xmm12 vxorps xmm7, xmm13 sub arg3, 128 jge .fold_128_B_loop ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; add arg2, 128 ; at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128 ; the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7 ; fold the 8 xmm registers to 1 xmm register with different constants vmovdqa xmm10, [rk9] vpclmulqdq xmm8, xmm0, xmm10, 0x1 vpclmulqdq xmm0, xmm0, xmm10, 0x10 vpxor xmm7, xmm8 vxorps xmm7, xmm0 vmovdqa xmm10, [rk11] vpclmulqdq xmm8, xmm1, xmm10, 0x1 vpclmulqdq xmm1, xmm1, xmm10, 0x10 vpxor xmm7, xmm8 vxorps xmm7, xmm1 vmovdqa xmm10, [rk13] vpclmulqdq xmm8, xmm2, xmm10, 0x1 vpclmulqdq xmm2, xmm2, xmm10, 0x10 vpxor xmm7, xmm8 vpxor xmm7, xmm2 vmovdqa xmm10, [rk15] vpclmulqdq xmm8, xmm3, xmm10, 0x1 vpclmulqdq xmm3, xmm3, xmm10, 0x10 vpxor xmm7, xmm8 vxorps xmm7, xmm3 vmovdqa xmm10, [rk17] vpclmulqdq xmm8, xmm4, xmm10, 0x1 vpclmulqdq xmm4, xmm4, xmm10, 0x10 
vpxor xmm7, xmm8 vpxor xmm7, xmm4 vmovdqa xmm10, [rk19] vpclmulqdq xmm8, xmm5, xmm10, 0x1 vpclmulqdq xmm5, xmm5, xmm10, 0x10 vpxor xmm7, xmm8 vxorps xmm7, xmm5 vmovdqa xmm10, [rk1] vpclmulqdq xmm8, xmm6, xmm10, 0x1 vpclmulqdq xmm6, xmm6, xmm10, 0x10 vpxor xmm7, xmm8 vpxor xmm7, xmm6 ; instead of 128, we add 128-16 to the loop counter to save 1 instruction from the loop ; instead of a cmp instruction, we use the negative flag with the jl instruction add arg3, 128-16 jl .final_reduction_for_128 ; now we have 16+y bytes left to reduce. 16 Bytes is in register xmm7 and the rest is in memory ; we can fold 16 bytes at a time if y>=16 ; continue folding 16B at a time .16B_reduction_loop: vpclmulqdq xmm8, xmm7, xmm10, 0x1 vpclmulqdq xmm7, xmm7, xmm10, 0x10 vpxor xmm7, xmm8 vmovdqu xmm0, [arg2] vpxor xmm7, xmm0 add arg2, 16 sub arg3, 16 ; instead of a cmp instruction, we utilize the flags with the jge instruction ; equivalent of: cmp arg3, 16-16 ; check if there is any more 16B in the buffer to be able to fold jge .16B_reduction_loop ;now we have 16+z bytes left to reduce, where 0<= z < 16. ;first, we reduce the data in the xmm7 register .final_reduction_for_128: add arg3, 16 je .128_done ; here we are getting data that is less than 16 bytes. ; since we know that there was data before the pointer, we can offset ; the input pointer before the actual point, to receive exactly 16 bytes. ; after that the registers need to be adjusted. .get_last_two_xmms: vmovdqa xmm2, xmm7 vmovdqu xmm1, [arg2 - 16 + arg3] ; get rid of the extra data that was loaded before ; load the shift constant lea rax, [pshufb_shf_table] add rax, arg3 vmovdqu xmm0, [rax] vpshufb xmm7, xmm0 vpxor xmm0, [mask3] vpshufb xmm2, xmm0 vpblendvb xmm2, xmm2, xmm1, xmm0 ;;;;;;;;;; vpclmulqdq xmm8, xmm7, xmm10, 0x1 vpclmulqdq xmm7, xmm7, xmm10, 0x10 vpxor xmm7, xmm8 vpxor xmm7, xmm2 .128_done: ; compute crc of a 128-bit value vmovdqa xmm10, [rk5] vmovdqa xmm0, xmm7 ;64b fold vpclmulqdq xmm7, xmm10, 0 vpsrldq xmm0, 8 vpxor xmm7, xmm0 ;32b fold vmovdqa xmm0, xmm7 vpslldq xmm7, 4 vpclmulqdq xmm7, xmm10, 0x10 vpxor xmm7, xmm0 ;barrett reduction .barrett: vpand xmm7, [mask2] vmovdqa xmm1, xmm7 vmovdqa xmm2, xmm7 vmovdqa xmm10, [rk7] vpclmulqdq xmm7, xmm10, 0 vpxor xmm7, xmm2 vpand xmm7, [mask] vmovdqa xmm2, xmm7 vpclmulqdq xmm7, xmm10, 0x10 vpxor xmm7, xmm2 vpxor xmm7, xmm1 vpextrd eax, xmm7, 2 .cleanup: not eax vmovdqa xmm6, [rsp + XMM_SAVE + 16*0] vmovdqa xmm7, [rsp + XMM_SAVE + 16*1] vmovdqa xmm8, [rsp + XMM_SAVE + 16*2] vmovdqa xmm9, [rsp + XMM_SAVE + 16*3] vmovdqa xmm10, [rsp + XMM_SAVE + 16*4] vmovdqa xmm11, [rsp + XMM_SAVE + 16*5] vmovdqa xmm12, [rsp + XMM_SAVE + 16*6] vmovdqa xmm13, [rsp + XMM_SAVE + 16*7] add rsp, VARIABLE_OFFSET ret ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; align 16 .less_than_256: ; check if there is enough buffer to be able to fold 16B at a time cmp arg3, 32 jl .less_than_32 ; if there is, load the constants vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 vmovd xmm0, arg1_low32 ; get the initial crc value vmovdqu xmm7, [arg2] ; load the plaintext vpxor xmm7, xmm0 ; update the buffer pointer add arg2, 16 ; update the counter. 
subtract 32 instead of 16 to save one instruction from the loop sub arg3, 32 jmp .16B_reduction_loop align 16 .less_than_32: ; mov initial crc to the return value. this is necessary for zero-length buffers. mov eax, arg1_low32 test arg3, arg3 je .cleanup vmovd xmm0, arg1_low32 ; get the initial crc value cmp arg3, 16 je .exact_16_left jl .less_than_16_left vmovdqu xmm7, [arg2] ; load the plaintext vpxor xmm7, xmm0 ; xor the initial crc value add arg2, 16 sub arg3, 16 vmovdqa xmm10, [rk1] ; rk1 and rk2 in xmm10 jmp .get_last_two_xmms align 16 .less_than_16_left: ; use stack space to load data less than 16 bytes, zero-out the 16B in memory first. vpxor xmm1, xmm1 mov r11, rsp vmovdqa [r11], xmm1 cmp arg3, 4 jl .only_less_than_4 ; backup the counter value mov r9, arg3 cmp arg3, 8 jl .less_than_8_left ; load 8 Bytes mov rax, [arg2] mov [r11], rax add r11, 8 sub arg3, 8 add arg2, 8 .less_than_8_left: cmp arg3, 4 jl .less_than_4_left ; load 4 Bytes mov eax, [arg2] mov [r11], eax add r11, 4 sub arg3, 4 add arg2, 4 .less_than_4_left: cmp arg3, 2 jl .less_than_2_left ; load 2 Bytes mov ax, [arg2] mov [r11], ax add r11, 2 sub arg3, 2 add arg2, 2 .less_than_2_left: cmp arg3, 1 jl .zero_left ; load 1 Byte mov al, [arg2] mov [r11], al .zero_left: vmovdqa xmm7, [rsp] vpxor xmm7, xmm0 ; xor the initial crc value lea rax,[pshufb_shf_table] vmovdqu xmm0, [rax + r9] vpshufb xmm7,xmm0 jmp .128_done align 16 .exact_16_left: vmovdqu xmm7, [arg2] vpxor xmm7, xmm0 ; xor the initial crc value jmp .128_done .only_less_than_4: cmp arg3, 3 jl .only_less_than_3 ; load 3 Bytes mov al, [arg2] mov [r11], al mov al, [arg2+1] mov [r11+1], al mov al, [arg2+2] mov [r11+2], al vmovdqa xmm7, [rsp] vpxor xmm7, xmm0 ; xor the initial crc value vpslldq xmm7, 5 jmp .barrett .only_less_than_3: cmp arg3, 2 jl .only_less_than_2 ; load 2 Bytes mov al, [arg2] mov [r11], al mov al, [arg2+1] mov [r11+1], al vmovdqa xmm7, [rsp] vpxor xmm7, xmm0 ; xor the initial crc value vpslldq xmm7, 6 jmp .barrett .only_less_than_2: ; load 1 Byte mov al, [arg2] mov [r11], al vmovdqa xmm7, [rsp] vpxor xmm7, xmm0 ; xor the initial crc value vpslldq xmm7, 7 jmp .barrett ; precomputed constants align 16 rk1: dq 0x00000000ccaa009e rk2: dq 0x00000001751997d0 rk3: dq 0x000000014a7fe880 rk4: dq 0x00000001e88ef372 rk5: dq 0x00000000ccaa009e rk6: dq 0x0000000163cd6124 rk7: dq 0x00000001f7011640 rk8: dq 0x00000001db710640 rk9: dq 0x00000001d7cfc6ac rk10: dq 0x00000001ea89367e rk11: dq 0x000000018cb44e58 rk12: dq 0x00000000df068dc2 rk13: dq 0x00000000ae0b5394 rk14: dq 0x00000001c7569e54 rk15: dq 0x00000001c6e41596 rk16: dq 0x0000000154442bd4 rk17: dq 0x0000000174359406 rk18: dq 0x000000003db1ecdc rk19: dq 0x000000015a546366 rk20: dq 0x00000000f1da05aa mask: dq 0xFFFFFFFFFFFFFFFF, 0x0000000000000000 mask2: dq 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF mask3: dq 0x8080808080808080, 0x8080808080808080 pshufb_shf_table: ; use these values for shift constants for the pshufb instruction ; different alignments result in values as shown: ; dq 0x8887868584838281, 0x008f8e8d8c8b8a89 ; shl 15 (16-1) / shr1 ; dq 0x8988878685848382, 0x01008f8e8d8c8b8a ; shl 14 (16-3) / shr2 ; dq 0x8a89888786858483, 0x0201008f8e8d8c8b ; shl 13 (16-4) / shr3 ; dq 0x8b8a898887868584, 0x030201008f8e8d8c ; shl 12 (16-4) / shr4 ; dq 0x8c8b8a8988878685, 0x04030201008f8e8d ; shl 11 (16-5) / shr5 ; dq 0x8d8c8b8a89888786, 0x0504030201008f8e ; shl 10 (16-6) / shr6 ; dq 0x8e8d8c8b8a898887, 0x060504030201008f ; shl 9 (16-7) / shr7 ; dq 0x8f8e8d8c8b8a8988, 0x0706050403020100 ; shl 8 (16- The Intel version is 
orders of magnitude faster than the naive version (> 100x faster). However, the Windows API ntdll!RtlComputeCrc32 is already a very optimised CRC-32 which can be even faster than the Intel version I presented above (it is faster on a newer Windows 11 PC than the Intel version, but slower on an older Windows 10 PC). I only tested the one Intel version. There are other versions, e.g. for AVX-512, that I cannot test and that could be faster than the Windows API.
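For anyone who wants to try the ntdll route mentioned above, here is a minimal Win64 sketch. RtlComputeCrc32 is an undocumented export, so the prototype used here (initial CRC, buffer pointer, length, returning the updated CRC) is only the commonly reported one and should be treated as an assumption; my_data and my_data_len are placeholder names.

Code:
; sketch: CRC-32 via the undocumented ntdll!RtlComputeCrc32 (assumed prototype:
; ULONG RtlComputeCrc32(ULONG initial, const void *buf, ULONG len))
        sub     rsp, 28h                ; shadow space + stack alignment
        xor     ecx, ecx                ; initial CRC = 0 (or a previous CRC when chaining)
        lea     rdx, [my_data]          ; placeholder buffer
        mov     r8d, my_data_len        ; placeholder length
        call    [RtlComputeCrc32]       ; imported from ntdll.dll
        add     rsp, 28h
        ; EAX = CRC-32 of the buffer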
Ali.Z 18 Dec 2025, 00:38
I updated my initial post with an optimized version of my earlier implementation. (There is more room to optimize.)
_________________
Asm For Wise Humans
wht36 18 Dec 2025, 06:58
sylware wrote: that's why I was asking about an AVX2 implementation of zlib crc32.

I haven't found an AVX2 implementation. I think using ymm registers with vpclmulqdq requires the separate VPCLMULQDQ feature (and AVX-512VL for its EVEX forms), which is only available in some CPU models.
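A runtime check for that is straightforward with CPUID leaf 7, sub-leaf 0 (AVX2 is EBX bit 5, VPCLMULQDQ is ECX bit 10). A sketch, not from the post above; it assumes leaf 7 exists and skips the XGETBV check for OS-enabled AVX state:

Code:
; sketch: detect AVX2 + VPCLMULQDQ (needed for 256-bit vpclmulqdq)
check_vpclmulqdq:
        mov     eax, 7
        xor     ecx, ecx
        cpuid
        bt      ebx, 5                  ; CPUID.(7,0):EBX bit 5 = AVX2
        jnc     .no
        bt      ecx, 10                 ; CPUID.(7,0):ECX bit 10 = VPCLMULQDQ
        jnc     .no
        mov     eax, 1                  ; supported
        ret
.no:
        xor     eax, eax                ; not supported
        ret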