4
\$\begingroup\$

Below is a complete implementation of RC4 in ARM assembly. Unit tests pass, including comparison to C implementation and to published test vectors.

I'm appreciative of all feedback, comments (things you like and things you don't), and suggestions, but particularly value your thoughts on:

  • Performance. Could the code be faster?
  • Clarity. How it could it be clearer? Use of idioms and patterns?
  • Comments. In C, you can name variables appropriately to make code self-documenting, whereas in ASM you generally stick with the register names. So I comment ASM in detail, but endeavor to do so in a way which doesn't interfere with the code, become cumbersome, or potentially desync. I enumerate what I do with each reg, and try to use comments to keep the asm connected to the higher level C it might implement. How well does this work?

I use GNU gas assembler, which allows comments to begin with # or, in ARM, @. My editor doesn't recognize the latter, so I add ; to make the syntax highlighting work. Pasting the code into Stackoverflow seems to change the alignment, so please forgive the extra spacing on some lines.

Usage: For one-shot invocation, rc4() is self contained. When iterating through streams, use ksa() once and then prga() on each successive buffer.

# rc4

.arm
.text

# void rc4_a(const char* key, uint32_t key_len, uint8_t* buf, uint32_t buf_len);
.global rc4_a
rc4_a:
    .equ _pi, 0x00
    .equ _pj, 0x04
    .equ _S,  0x08
    push {r4-r7, lr}
    sub sp, sp, #(2 * 4 + 0x100)
    mov r4, #0
    str r4, [sp, #_pi]
    str r4, [sp, #_pj]

    mov r4, r0
    mov r5, r1
    mov r6, r2
    mov r7, r3
    add r2, sp, #_S 
    bl ksa_a
    mov r0, r6
    mov r1, r7
    add r2, sp, #_S
    add r3, sp, #_pi
    add r4, sp, #_pj
    push {r4} 
    bl prga_a
    add sp, sp, #(3 * 4 + 0x100)
    pop {r4-r7, lr}
    bx lr

# void prga_a(uint8_t* buf, uint32_t buf_len, uint8_t S[256], uint8_t* i, uint8_t* j);
.global prga_a
prga_a:
# i := 0
# j := 0
# while GeneratingOutput:
#   i := (i + 1) mod 256
#   j := (j + S[i]) mod 256
#   swap values of S[i] and S[j]
#   K := S[(S[i] + S[j]) mod 256]
#   output K

        # r0: unsigned char* buf
        # r1: Initially: buf_len
        # r1: During execution: buf + buf_len
        # r2: char S[256]
        # r3: Initially: &i_
        # r3: During execution: i
        # sp: Initially: &j_
        # r4: j
        # r5: S[i]
        # r6: S[j]
        # r7: S[S[i]+S[j]]
        # r8: *(buf++)
        .equ _pi, ((0)*4)               @; r3=&i, since r3 is lowest reg, it will push to sp
        .equ _pj, ((8-3+1)*4)   @; &j is at sp before pushing r3-r8 (ie 6 words)

        cmp r1, #0
        bxeq lr                 @; if (buf_len == 0) return
        add r1, r1, r0  @; r1 = buf + buf_len
        push {r3-r8}
        ldr r3, [sp, #_pi]
        ldrb r3, [r3]   @; r3 = i
        ldr r4, [sp, #_pj]
        ldrb r4, [r4]   @; r4 = j

.L3:                    @; PERFORMANCE: Consider unrolling this loop, 
                        @;              to allow ldr/str of more than 1 byte
                        @;              of buf at a time.
        add r3, r3, #1          @; i++
        and r3, r3, #0xff   @; i %= 256
        ldrb r5, [r2, r3]   @; r5 = S[i]
        add r4, r4, r5      @; j += S[i]
        and r4, r4, #0xff   @; j %= 256

    ldrb r6, [r2, r4]   @; swap(&S[i], &S[j])
    strb r5, [r2, r4]
    strb r6, [r2, r3]

    add r7, r5, r6         @; r7 = (S[i] + S[j])
    and r7, #0xff      @;      % 256
    ldrb r7, [r2, r7]  @; r7 = S[r7]
        ldrb r8, [r0], #1  @; r8 = *(buf++)
        eor r8, r8, r7     @; r8 ^= S[(S[i] + S[j]) % 256]
        strb r8, [r0, #-1] @; *(buf-1) ^= S[(S[i] + S[j]) % 256]
        cmp r0, r1                
        blt .L3

        ldr r6, [sp, #_pi]
        ldr r7, [sp, #_pj]
        strb r3, [r6]      @; *i_ = i
        strb r4, [r7]      @; *j_ = j
        pop {r3-r8}
        bx lr

# void ksa_a(const char* key, uint32_t key_len, uint8_t S[256]);
.global ksa_a
ksa_a:
# for i from 0 to 255
#    S[i] := i
# j := 0
# for i from 0 to 255
#    j := (j + S[i] + key[i mod keylength]) mod 256
#    swap values of S[i] and S[j]

    # r0: const char* key
    # r1: unsigned int key_len
    # r2: unsigned char[256] S
    # r3: int i

    push {r4, r5, r6, r7, r8}
    mov r3, #0         @; i = 0
.L0:
    strb r3, [r2, r3]  @; S[i] = i
    add r3, r3, #1     @; i++
    cmp r3, #0x100     @; while (i < 256)
    blt .L0

    cmp r1, #0         @; if (key_len == 0) return
    beq .L2

    # r4: unsigned char j
    # r5: unsigned int k // Points to byte of key
    # r6: S[i]
    # r7: S[j]
    # r8: key[k]
    mov r3, #0                 @; i = 0
    mov r4, #0                 @; j = 0
    mov r5, #0         @; k = 0
.L1:
    # j += (S[i] + key[k % key_len])
    ldrb r6, [r2, r3]  @; r6 = S[i]
    ldrb r8, [r0, r5]  @; r8 = key[k]
    add r4, r4, r6     @; j += S[i]
    add r4, r4, r8     @; j += key[k]
    and r4, r4, #0xff  @; j %= 256
    # swap(&S[i], &S[j])
    ldrb r7, [r2, r4]
    strb r6, [r2, r4]
    strb r7, [r2, r3]
    add r3, r3, #1     @; i++
    add r5, r5, #1         @; k++
    cmp r5, r1         @; k %= key_len
    moveq r5, #0
    cmp r3, #0x100     @; while (i < 256)
    blt .L1

.L2:
    pop {r4, r5, r6, r7, r8}            
    bx lr
\$\endgroup\$
3
  • 3
    \$\begingroup\$ It may help you get better reviews if you include the unit tests here. And reviewers might improve the testing, too! \$\endgroup\$ Commented Jan 16, 2022 at 16:40
  • 1
    \$\begingroup\$ You might want to add a like to a description of what RC4 is for those of us who don't know. \$\endgroup\$ Commented Jan 17, 2022 at 1:16
  • \$\begingroup\$ en.wikipedia.org/wiki/RC4 \$\endgroup\$ Commented Feb 4, 2022 at 1:56

0

You must log in to answer this question.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.