32-bit AES

This section explains how to use the aes32esmi, aes32esi, aes32dsi and aes32dsmi instructions in the Zkne extension to simplify and speed up the implementation of the Advanced Encryption Standard. The instructions can be used to implement AES128, AES196 and AES256. A talk at the RISC-V summit1 claims a speed-up of ~4x and a code size reduction of 0.3x1.

⚠️ WARNING ⚠️

It is especially difficult to implement cryptography correctly and securely. If you can use a existing implementation that has been battle tested, you probably should. Still, this page exists to show how you would go about using this extension.

Encryption

The aes32esmi instruction helps with implementing the middle rounds of AES. It performs a byte substitution, mixing of columns and adding the roundkey. The aes32esi instruction is used for the last round of the AES and performs a byte substitution and adding the roundkey. An rust equivalent implementation of the instructions would look like:

#![allow(unused)]
fn main() {
static SBOX: [u8; 256] = [
    // ...
];

fn xt2(x: u8) -> u8 {
    (x << 1) ^ if x & 0x80 != 0 { 0x1B } else { 0x00 }
}

// Galois Field Multiplication for y in [[0..16]]
fn gfmul(x: u8, y: u8) -> u8 {
    let mut out = 0;
    let mut mask = x;

    for i in 0..4 {
        if y & (1 << i) != 0 {
            out ^= mask;
        }

        mask = xt2(x);
    }

    mask
}

fn aes32esmi(rs1: u32, rs2: u32, bs: u8) -> u32 {
    let shift_amount = bs * 8;

    // Substitution
    let sub_input = (rs2 >> shift_amount) & 0xFF;
    let sub_output = SBOX[sub_input as usize] as u8;

    // Mix Columns
    let mixed = u32::from_be_bytes([
        gfmul(sub_output, 0x3),
        sub_output,
        sub_output,
        gfmul(sub_output, 0x2),
    ]);

    // Add Roundkey
    rs1 ^ mixed.rotate_left(shift_amount)
}

fn aes32esi(rs1: u32, rs2: u32, bs: u8) -> u32 {
    let shift_amount = bs * 8;

    // Substitution
    let sub_input = (rs2 >> shift_amount) & 0xFF;
    let sub_output = SBOX[sub_input as usize] as u32;

    // Add Roundkey
    rs1 ^ (sub_output << shift_amount)
}
}

Middle Round implementation

This can be used to implement an encryption middle encryption round, where rk is an array of the roundkeys and block is the input state. Note, how in the following code example it manually handles the shifting of rows.

#![allow(unused)]
fn main() {
// Block and RoundKey contain little-endian encoded rows
let RoundKey(mut a0, mut a1, mut a2, mut a3) = rk[i];

a0 = aes32esmi(a0, block.0, 0);
a0 = aes32esmi(a0, block.1, 1);
a0 = aes32esmi(a0, block.2, 2);
a0 = aes32esmi(a0, block.3, 3);

a1 = aes32esmi(a1, block.1, 0);
a1 = aes32esmi(a1, block.2, 1);
a1 = aes32esmi(a1, block.3, 2);
a1 = aes32esmi(a1, block.0, 3);

a2 = aes32esmi(a2, block.2, 0);
a2 = aes32esmi(a2, block.3, 1);
a2 = aes32esmi(a2, block.0, 2);
a2 = aes32esmi(a2, block.1, 3);

a3 = aes32esmi(a3, block.3, 0);
a3 = aes32esmi(a3, block.0, 1);
a3 = aes32esmi(a3, block.1, 2);
a3 = aes32esmi(a3, block.2, 3);

block = Block(a0, a1, a2, a3);
}

Final Round implementation

Similarly to the Middle Round implementation, the final round is implemented. Here, the aes32esmi instruction is replaced by the aes32esi instruction.

#![allow(unused)]
fn main() {
// Block and RoundKey contain little-endian encoded rows
let RoundKey(mut a0, mut a1, mut a2, mut a3) = rk[i];

a0 = aes32esi(a0, block.0, 0);
a0 = aes32esi(a0, block.1, 1);
a0 = aes32esi(a0, block.2, 2);
a0 = aes32esi(a0, block.3, 3);

a1 = aes32esi(a1, block.1, 0);
a1 = aes32esi(a1, block.2, 1);
a1 = aes32esi(a1, block.3, 2);
a1 = aes32esi(a1, block.0, 3);

a2 = aes32esi(a2, block.2, 0);
a2 = aes32esi(a2, block.3, 1);
a2 = aes32esi(a2, block.0, 2);
a2 = aes32esi(a2, block.1, 3);

a3 = aes32esi(a3, block.3, 0);
a3 = aes32esi(a3, block.0, 1);
a3 = aes32esi(a3, block.1, 2);
a3 = aes32esi(a3, block.2, 3);

block = Block(a0, a1, a2, a3);
}

Decryption

#![allow(unused)]

fn main() {
}

Key Schedule implementation

To implement the key schedule, we can also use the aes32esi instruction. This prevents the need for a substitution table in software. The implementation differs slightly between AES128, AES196 and AES256 and therefore all three implementations are given separately.

#![allow(unused)]
fn main() {
pub struct AES128Key(u32, u32, u32, u32);
pub struct AES196Key(u32, u32, u32, u32, u32, u32);
pub struct AES256Key(u32, u32, u32, u32, u32, u32, u32, u32);

pub struct RoundKey(u32, u32, u32, u32);

fn aes128_key_schedule(ck: AES128Key) -> [RoundKey; 11] {
    let mut rk = [0u32; 11 * 4];

    let AES128Key(
        mut t0, mut t1,
        mut t2, mut t3,
    ) = ck;

    let mut i = 0;
    loop {
        rk[(i << 2) + 0] = t0;
        rk[(i << 2) + 1] = t1;
        rk[(i << 2) + 2] = t2;
        rk[(i << 2) + 3] = t3;

        if i == 10 {
            break;
        }

        t0 ^= u32::from(RCON[i]);
        let tr = t3.rotate_right(8);

        t0 = aes32esi(t0, tr, 0);
        t0 = aes32esi(t0, tr, 1);
        t0 = aes32esi(t0, tr, 2);
        t0 = aes32esi(t0, tr, 3);

        t1 ^= t0;
        t2 ^= t1;
        t3 ^= t2;

        i += 1;
    }

    // SAFETY: We know that rk has 13 * 4 times a u32. So it has space for 13 RoundKeys
    unsafe { core::mem::transmute(rk) }
}

fn aes196_key_schedule(ck: AES196Key) -> [RoundKey; 13] {
    let mut rk = [0u32; 13 * 4];

    let AES196Key(
        mut t0, mut t1,
        mut t2, mut t3,
        mut t4, mut t5,
    ) = ck;

    let mut i = 0;
    loop {
        rk[i * 6 + 0] = t0;
        rk[i * 6 + 1] = t1;
        rk[i * 6 + 2] = t2;
        rk[i * 6 + 3] = t3;

        if i == 8 {
            break;
        }

        rk[i * 6 + 4] = t4;
        rk[i * 6 + 5] = t5;

        t0 ^= u32::from(RCON[i]);
        let tr = t5.rotate_right(8);

        t0 = aes32esi(t0, tr, 0);
        t0 = aes32esi(t0, tr, 1);
        t0 = aes32esi(t0, tr, 2);
        t0 = aes32esi(t0, tr, 3);

        t1 ^= t0;
        t2 ^= t1;
        t3 ^= t2;
        t4 ^= t3;
        t5 ^= t4;

        i += 1;
    }

    // SAFETY: We know that rk has 13 * 4 times a u32. So it has space for 13 RoundKeys
    unsafe { core::mem::transmute(rk) }
}

fn aes256_key_schedule(ck: AES256Key) -> [RoundKey; 15] {
    let mut rk = [0u32; 15 * 4];

    let AES256Key(
        mut t0, mut t1,
        mut t2, mut t3,
        mut t4, mut t5,
        mut t6, mut t7,
    ) = ck;

    let mut i = 0;
    loop {
        rk[i * 8 + 0] = t0;
        rk[i * 8 + 1] = t1;
        rk[i * 8 + 2] = t2;
        rk[i * 8 + 3] = t3;

        if i == 7 {
            break;
        }

        rk[i * 8 + 4] = t4;
        rk[i * 8 + 5] = t5;
        rk[i * 8 + 6] = t6;
        rk[i * 8 + 7] = t7;

        t0 ^= u32::from(RCON[i]);
        let tr = t7.rotate_right(8);

        t0 = aes32esi(t0, tr, 0);
        t0 = aes32esi(t0, tr, 1);
        t0 = aes32esi(t0, tr, 2);
        t0 = aes32esi(t0, tr, 3);

        t1 ^= t0;
        t2 ^= t1;
        t3 ^= t2;

        t4 = aes32esi(t4, t3, 0);
        t4 = aes32esi(t4, t3, 1);
        t4 = aes32esi(t4, t3, 2);
        t4 = aes32esi(t4, t3, 3);

        t5 ^= t4;
        t6 ^= t5;
        t7 ^= t6;

        i += 1;
    }

    // SAFETY: We know that rk has 15 * 4 times a u32. So it has space for 15 RoundKeys
    unsafe { core::mem::transmute(rk) }
}

fn aes_decrypt_key_schedule<const KEYS: usize>(rk: &mut [RoundKey; KEYS]) {
    fn subkey(mut x: u32) -> u32 {
        let mut y;

        unsafe {
            y = aes32esi(0, x, 0);
            y = aes32esi(y, x, 1);
            y = aes32esi(y, x, 2);
            y = aes32esi(y, x, 3);

            x = aes32dsmi(0, y, 0);
            x = aes32dsmi(x, y, 1);
            x = aes32dsmi(x, y, 2);
            x = aes32dsmi(x, y, 3);
        }

        x
    }

    for k in &mut rk[1..KEYS - 1] {
        unsafe {
            k.0 = subkey(k.0);
            k.1 = subkey(k.1);
            k.2 = subkey(k.2);
            k.3 = subkey(k.3);
        }
    }
}

fn aes128_decrypt_key_schedule(rk: &mut [RoundKey; 11]) {
    aes_decrypt_key_schedule::<11>(rk)
}

fn aes196_decrypt_key_schedule(rk: &mut [RoundKey; 13]) {
    aes_decrypt_key_schedule::<13>(rk)
}

fn aes256_decrypt_key_schedule(rk: &mut [RoundKey; 15]) {
    aes_decrypt_key_schedule::<15>(rk)
}
}