Below is a complete implementation of RC4 in ARM assembly. Unit tests pass, including comparison to C implementation and to published test vectors.
I'm appreciative of all feedback, comments (things you like and things you don't), and suggestions, but particularly value your thoughts on:
- Performance. Could the code be faster?
- Clarity. How it could it be clearer? Use of idioms and patterns?
- Comments. In C, you can name variables appropriately to make code self-documenting, whereas in ASM you generally stick with the register names. So I comment ASM in detail, but endeavor to do so in a way which doesn't interfere with the code, become cumbersome, or potentially desync. I enumerate what I do with each reg, and try to use comments to keep the asm connected to the higher level C it might implement. How well does this work?
I use GNU gas assembler, which allows comments to begin with # or, in ARM, @. My editor doesn't recognize the latter, so I add ; to make the syntax highlighting work. Pasting the code into Stackoverflow seems to change the alignment, so please forgive the extra spacing on some lines.
Usage: For one-shot invocation, rc4() is self contained. When iterating through streams, use ksa() once and then prga() on each successive buffer.
# rc4
.arm
.text
# void rc4_a(const char* key, uint32_t key_len, uint8_t* buf, uint32_t buf_len);
.global rc4_a
rc4_a:
.equ _pi, 0x00
.equ _pj, 0x04
.equ _S, 0x08
push {r4-r7, lr}
sub sp, sp, #(2 * 4 + 0x100)
mov r4, #0
str r4, [sp, #_pi]
str r4, [sp, #_pj]
mov r4, r0
mov r5, r1
mov r6, r2
mov r7, r3
add r2, sp, #_S
bl ksa_a
mov r0, r6
mov r1, r7
add r2, sp, #_S
add r3, sp, #_pi
add r4, sp, #_pj
push {r4}
bl prga_a
add sp, sp, #(3 * 4 + 0x100)
pop {r4-r7, lr}
bx lr
# void prga_a(uint8_t* buf, uint32_t buf_len, uint8_t S[256], uint8_t* i, uint8_t* j);
.global prga_a
prga_a:
# i := 0
# j := 0
# while GeneratingOutput:
# i := (i + 1) mod 256
# j := (j + S[i]) mod 256
# swap values of S[i] and S[j]
# K := S[(S[i] + S[j]) mod 256]
# output K
# r0: unsigned char* buf
# r1: Initially: buf_len
# r1: During execution: buf + buf_len
# r2: char S[256]
# r3: Initially: &i_
# r3: During execution: i
# sp: Initially: &j_
# r4: j
# r5: S[i]
# r6: S[j]
# r7: S[S[i]+S[j]]
# r8: *(buf++)
.equ _pi, ((0)*4) @; r3=&i, since r3 is lowest reg, it will push to sp
.equ _pj, ((8-3+1)*4) @; &j is at sp before pushing r3-r8 (ie 6 words)
cmp r1, #0
bxeq lr @; if (buf_len == 0) return
add r1, r1, r0 @; r1 = buf + buf_len
push {r3-r8}
ldr r3, [sp, #_pi]
ldrb r3, [r3] @; r3 = i
ldr r4, [sp, #_pj]
ldrb r4, [r4] @; r4 = j
.L3: @; PERFORMANCE: Consider unrolling this loop,
@; to allow ldr/str of more than 1 byte
@; of buf at a time.
add r3, r3, #1 @; i++
and r3, r3, #0xff @; i %= 256
ldrb r5, [r2, r3] @; r5 = S[i]
add r4, r4, r5 @; j += S[i]
and r4, r4, #0xff @; j %= 256
ldrb r6, [r2, r4] @; swap(&S[i], &S[j])
strb r5, [r2, r4]
strb r6, [r2, r3]
add r7, r5, r6 @; r7 = (S[i] + S[j])
and r7, #0xff @; % 256
ldrb r7, [r2, r7] @; r7 = S[r7]
ldrb r8, [r0], #1 @; r8 = *(buf++)
eor r8, r8, r7 @; r8 ^= S[(S[i] + S[j]) % 256]
strb r8, [r0, #-1] @; *(buf-1) ^= S[(S[i] + S[j]) % 256]
cmp r0, r1
blt .L3
ldr r6, [sp, #_pi]
ldr r7, [sp, #_pj]
strb r3, [r6] @; *i_ = i
strb r4, [r7] @; *j_ = j
pop {r3-r8}
bx lr
# void ksa_a(const char* key, uint32_t key_len, uint8_t S[256]);
.global ksa_a
ksa_a:
# for i from 0 to 255
# S[i] := i
# j := 0
# for i from 0 to 255
# j := (j + S[i] + key[i mod keylength]) mod 256
# swap values of S[i] and S[j]
# r0: const char* key
# r1: unsigned int key_len
# r2: unsigned char[256] S
# r3: int i
push {r4, r5, r6, r7, r8}
mov r3, #0 @; i = 0
.L0:
strb r3, [r2, r3] @; S[i] = i
add r3, r3, #1 @; i++
cmp r3, #0x100 @; while (i < 256)
blt .L0
cmp r1, #0 @; if (key_len == 0) return
beq .L2
# r4: unsigned char j
# r5: unsigned int k // Points to byte of key
# r6: S[i]
# r7: S[j]
# r8: key[k]
mov r3, #0 @; i = 0
mov r4, #0 @; j = 0
mov r5, #0 @; k = 0
.L1:
# j += (S[i] + key[k % key_len])
ldrb r6, [r2, r3] @; r6 = S[i]
ldrb r8, [r0, r5] @; r8 = key[k]
add r4, r4, r6 @; j += S[i]
add r4, r4, r8 @; j += key[k]
and r4, r4, #0xff @; j %= 256
# swap(&S[i], &S[j])
ldrb r7, [r2, r4]
strb r6, [r2, r4]
strb r7, [r2, r3]
add r3, r3, #1 @; i++
add r5, r5, #1 @; k++
cmp r5, r1 @; k %= key_len
moveq r5, #0
cmp r3, #0x100 @; while (i < 256)
blt .L1
.L2:
pop {r4, r5, r6, r7, r8}
bx lr