Pipline_Arm_CPU/tools/sim/Benchmarks/test10_forwarding.arm

150 lines
7.7 KiB
Plaintext

// Test of data forwarding (registers, X31, loads/stores, flags, and CBZ operands) on the pipelined CPU.
// Requires:
// ADDI, ADDS, SUBS, CBZ, B.LT, B, LDUR, STUR
// Expected results:
// X0 = 0
// X1 = 8
// X2 = 4 (on pipelined CPU), or 0 (single-cycle CPU).
// X3 = 5
// X4 = 7
// X5 = 2
// X6 = -2
// X7 = -2
// X8 = 0
// X9 = 1
// X10 = -4
// X14 = 5
// X15 = 8
// X16 = 9
// X17 = 1
// X18 = 99
// Mem[0] = 8
// Mem[8] = 5
//ADDI: I-type, Reg[Rd] = Reg[Rn] + {'0, Imm12}
//OP Imm12 Rn Rd
//3322222222 221111111111 00000 00000
//1098765432 109876543210 98765 43210
//1001000100 Unsigned 0..31 0..31
//B: B-type, PC = PC + SignExtend({Imm26, 2'b00})
//OP Imm26
//332222 22222211111111110000000000
//109876 54321098765432109876543210
//000101 2's Comp Imm26
//CBZ: CB-type, if (R[Rt] == 0) PC = PC + SignExtend({Imm19, 2'b00})
//OP Imm19 Rt
//33222222 2222111111111100000 00000
//10987654 3210987654321098765 43210
//10110100 2's Comp Imm19 0..31
//SUBS: R-type, Reg[Rd] = Reg[Rn] - Reg[Rm]
//OP Rm Shamt Rn Rd
//33222222222 21111 111111 00000 00000
//10987654321 09876 543210 98765 43210
//11101011000 0..31 000000 0..31 0..31
//ADDS: R-type, Reg[Rd] = Reg[Rn] + Reg[Rm]
//OP Rm Shamt Rn Rd
//33222222222 21111 111111 00000 00000
//10987654321 09876 543210 98765 43210
//10101011000 0..31 000000 0..31 0..31
//B.LT: CB-type, if (flags meet condition) PC = PC + SignExtend({Imm19, 2'b00})
//OP Imm19 Cond
//33222222 2222111111111100000 00000
//10987654 3210987654321098765 43210
//01010100 2's Comp Imm19 01011
//LDUR: D-type, Reg[Rt] = Mem[Reg[Rn] + SignExtend(Imm9)]
//OP Imm9 00 Rn Rt
//33222222222 211111111 11 00000 00000
//10987654321 098765432 10 98765 43210
//11111000010 2's Comp 00 0..31 0..31
//STUR: D-type, Mem[Reg[Rn] + SignExtend(Imm9)] = Reg[Rt]
//OP Imm9 00 Rn Rt
//33222222222 211111111 11 00000 00000
//10987654321 098765432 10 98765 43210
//11111000000 2's Comp 00 0..31 0..31
// MAIN:
// Program entry (pc = 0). Each line is one 32-bit instruction word, so pc advances by 4 per line.
// Start X0, X1 and X2 from a known value of 0.
1001000100_000000000000_11111_00000 // ADDI X0, X31, #0 // X0 = 0
1001000100_000000000000_11111_00001 // ADDI X1, X31, #0 // X1 = 0
1001000100_000000000000_11111_00010 // ADDI X2, X31, #0 // X2 = 0, counter of branch delay slots (incremented by the instruction after each taken branch).
// // Simple forwarding
// Every instruction after the first reads registers written 1 to 4 instructions earlier,
// so the same operands are consumed at several different producer distances.
1001000100_000000000101_11111_00011 // ADDI X3, X31, #5 // X3 = 5
1001000100_000000000010_00011_00100 // ADDI X4, X3, #2 // X4 = 7 (X3 written 1 instruction earlier)
11101011000_00011_000000_00100_00101 // SUBS X5, X4, X3 // X5 = 2 (X4 written 1 earlier, X3 written 2 earlier)
11101011000_00100_000000_00011_00110 // SUBS X6, X3, X4 // X6 = -2 (X4 written 2 earlier, X3 written 3 earlier)
11101011000_00100_000000_00011_00111 // SUBS X7, X3, X4 // X7 = -2 (X4 written 3 earlier, X3 written 4 earlier)
// // Forwarding and X31
// X31 must always read as 0: the write below has to be discarded, and its result must not
// be forwarded to the following instructions that read X31 (otherwise X8 would not end up 0).
1001000100_111111111111_11111_11111 // ADDI X31, X31, #4095 // Imm12 is all ones; ADDI zero-extends it, so this tries to write 4095 (not -1) to X31, which should stay as 0
11101011000_11111_000000_00001_01000 // SUBS X8, X1, X31 // X8 = X1 - 0 = 0 (X1 is still 0 here; X31 "written" 1 instruction earlier)
11101011000_01000_000000_11111_01000 // SUBS X8, X31, X8 // X8 = 0 - 0 = 0 (X8 written 1 earlier, X31 "written" 2 earlier)
11101011000_11111_000000_01000_01000 // SUBS X8, X8, X31 // X8 = 0 (X31 "written" 3 earlier)
11101011000_11111_000000_01000_01000 // SUBS X8, X8, X31 // X8 = 0 (X31 "written" 4 earlier)
// // Forwarding in the face of multiple writes
// X9 is written twice back to back; every reader must get the most recent value (1), not the older 2.
// X10 is then rewritten by each SUBS and immediately read by the next one.
1001000100_000000000010_11111_01001 // ADDI X9, X31, #2 // X9 = 2 (older write, must lose)
1001000100_000000000001_11111_01001 // ADDI X9, X31, #1 // X9 = 1 (newest write, must win)
11101011000_01001_000000_11111_01010 // SUBS X10, X31, X9 // X10 = 0 - 1 = -1 (would be -2 if the older X9 were forwarded)
11101011000_01001_000000_01010_01010 // SUBS X10, X10, X9 // X10 = -2
11101011000_01001_000000_01010_01010 // SUBS X10, X10, X9 // X10 = -3
11101011000_01001_000000_01010_01010 // SUBS X10, X10, X9 // X10 = -4
// // Forwarding involving an instruction that doesn't write the register file
// STUR carries X3 in its Rt field (bits 4..0, where other formats keep Rd), but only reads it.
// NOTE(review): presumably this checks that nothing is forwarded from the STUR to the ADDI that reads X3 next.
11111000000_000000001_00_00100_00011 // STUR X3, [X4, #1] // Address = X4 + 1 = 7 + 1 = 8, so Mem[8] = 5
1001000100_000000000000_00011_01110 // ADDI X14, X3, #0 // X14 = 5 (X3 must still be the value from the register file)
// // Forwarding and load/store instructions
// The store data (X1) is produced by the instruction right before the STUR, and the loaded
// value (X15) is consumed two instructions after the LDUR, with one noop in between.
1001000100_000000001000_11111_00001 // ADDI X1, X31, #8 // X1 = 8
11111000000_000000000_00_11111_00001 // STUR X1, [X31, #0] // Mem[0] = 8 (X1 written 1 instruction earlier)
11111000010_000000000_00_11111_01111 // LDUR X15, [X31, #0] // X15 = Mem[0] = 8 (reads the location stored 1 instruction earlier)
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop (separates the load from its first use)
1001000100_000000000001_01111_10000 // ADDI X16, X15, #1 // X16 = 9 (X15 loaded 2 instructions earlier)
// // Flags and the pipelined CPU (set flag and quickly or slowly branch).
// Case 1: B.LT is the instruction immediately after the SUBS that sets the flags.
10101011000_11111_000000_11111_11111 // ADDS X31, X31, X31 // Noop that resets the flags (0 + 0: N, C and V are 0, Z is 1), so LT does not hold yet.
11101011000_00011_000000_11111_11111 // SUBS X31, X31, X3 // Yes, 0 < 5. Set flags (0 - 5 is negative, so LT holds)
01010100_0000000000000000100_01011 // B.LT TAKEN1 // Take the branch (+4). pc=112, target = 112 + 4*4 = 128
1001000100_000000000001_00010_00010 // ADDI X2, X2, #1 // X2 = 1 (increment delay slot counter)
// ERROR1:
000101_00000000000000000000000000 // B ERROR1 // Should never get here (0). Offset 0 branches to itself (pc = 120).
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
// TAKEN1:
// pc = 128. Case 2: one noop separates the flag-setting SUBS from the B.LT.
10101011000_11111_000000_11111_11111 // ADDS X31, X31, X31 // Noop that resets the flags (0 + 0: N, C and V are 0, Z is 1), so LT does not hold yet.
11101011000_00011_000000_11111_11111 // SUBS X31, X31, X3 // Yes, 0 < 5. Set flags (0 - 5 is negative, so LT holds)
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop - same as above but 1 cycle later. ADDI does not set flags.
01010100_0000000000000000100_01011 // B.LT TAKEN2 // Take the branch (+4). pc = 140, target = 140 + 4*4 = 156
1001000100_000000000001_00010_00010 // ADDI X2, X2, #1 // X2 = 2 (increment delay slot counter)
// ERROR2:
000101_00000000000000000000000000 // B ERROR2 // Should never get here (0). Offset 0 branches to itself (pc = 148).
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
// TAKEN2:
// Case 3: four noops separate the flag-setting SUBS from the B.LT, so the flags must be
// held unchanged across instructions that do not set them.
10101011000_11111_000000_11111_11111 // ADDS X31, X31, X31 // Noop that resets the flags (0 + 0: N, C and V are 0, Z is 1). pc = 156
11101011000_00011_000000_11111_11111 // SUBS X31, X31, X3 // Yes, 0 < 5. Set flags (0 - 5 is negative, so LT holds)
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop - same as above but much longer.
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
01010100_0000000000000000100_01011 // B.LT TAKEN3 // Take the branch (+4). pc = 180, target = 180 + 4*4 = 196
1001000100_000000000001_00010_00010 // ADDI X2, X2, #1 // X2 = 3 (increment delay slot counter)
// ERROR3:
000101_00000000000000000000000000 // B ERROR3 // Should never get here (0). Offset 0 branches to itself (pc = 188).
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop
// TAKEN3:
1001000100_000000000001_11111_10001 // ADDI X17, X31, #1 // X17 = 1 pc = 196
// // Forwarding to conditional branch
// Each CBZ tests X0 right after the instruction that writes it, so the branch decision
// must use the new value: 2 (not taken) the first time, 0 (taken) the second time.
1001000100_000000000010_11111_00000 // ADDI X0, X31, #2 // X0 = 2
10110100_0000000000000000101_00000 // CBZ X0, ERROR4 // Should not be taken (+5). pc = 204, target = 204 + 5*4 = 224; a stale X0 = 0 would wrongly take it.
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop.
1001000100_000000000000_11111_00000 // ADDI X0, X31, #0 // X0 = 0
10110100_0000000000000000100_00000 // CBZ X0, SUCCESS // Should be taken (+4). pc = 216, target = 216 + 4*4 = 232
1001000100_000000000001_00010_00010 // ADDI X2, X2, #1 // X2 = 4 (increment delay slot counter)
// ERROR4:
000101_00000000000000000000000000 // B ERROR4 // Loop forever (0). pc = 224, reached only if a CBZ above misbehaves.
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop.
// SUCCESS:
// pc = 232. Reaching here means every branch above resolved as expected.
1001000100_000001100011_11111_10010 // ADDI X18, X31, #99 // Show that we did finish (X18 = 99).
// HALT:
000101_00000000000000000000000000 // B HALT // Done (0). Offset 0 branches to itself (pc = 236), ending the test in a spin loop.
1001000100_000000000000_11111_11111 // ADDI X31, X31, #0 // Noop.