From cad06d86cd309074fffb5cce9d1f3b79b40f3891 Mon Sep 17 00:00:00 2001 From: jjsuperpower Date: Sun, 21 Aug 2022 22:33:56 -0500 Subject: basic alu coded and tested --- .gitignore | 4 +- .vscode/.gitignore | 1 + doc/ASAP32-ISA.md | 229 +++++++++++++++++++++++++++++++++++++++++++++++++ hdl/core.py | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 469 insertions(+), 12 deletions(-) create mode 100644 .vscode/.gitignore create mode 100644 doc/ASAP32-ISA.md diff --git a/.gitignore b/.gitignore index 1642c9d..ec81623 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,6 @@ __pycache__/ *.vcd *.cc *.bak -*.pytest_cache \ No newline at end of file +*.pytest_cache +*.pyc +*.log \ No newline at end of file diff --git a/.vscode/.gitignore b/.vscode/.gitignore new file mode 100644 index 0000000..bf0824e --- /dev/null +++ b/.vscode/.gitignore @@ -0,0 +1 @@ +*.log \ No newline at end of file diff --git a/doc/ASAP32-ISA.md b/doc/ASAP32-ISA.md new file mode 100644 index 0000000..b5ed60d --- /dev/null +++ b/doc/ASAP32-ISA.md @@ -0,0 +1,229 @@ +# Vertex KISS 32 - Machine Code Spec +^ So Vertex is already a name for an FPGA, should we change this? +I propose a different name: + + ASAP Soc 32 a KISS inspired architecture + + As + Simple + As + Possible + +## General Instruction Format + X = HEX + B = BIN + + d = HEX (not used / don't care) + + MAX INSTRUCTIIONS = 256 + ALL INSTRUCTIONS ARE ATOMIC + + ALL MEMORY ADDRESSES ARE 32-bit, not 8-bit + ^ What do you think about this? 
32 bits is usually an 'int' in C + This would extend the address space, kinda + + Also I have been reading about caching, I think the instruction width needs to be the same as the data memory + +### C-Type, Control + XX X X XXXX + Opcode d RS1 IMM + +### I-Type, Immediate + XX X X XXXX + Opcode RD RS1 IMM + +### R-Type, Arithmetic + XX X X X XXX + Opcode RD RS1 RS2 ddd + +### JR-Type, Compare and Jump + XX X X X XXX + Opcode Jump Condition RS1 RS2 ddd + +### JI-Type, Compare and Jump + XX X X XXXX + Opcode Jump Condition RS1 IMM + +## Registers + Maximum registers = 16 + Register width = 32 + All are R/W except 0X + + 0X Always Zero + AX GP-0 + BX GP-1 + CX GP-2 + DX GP-3 + EX GP-4 + FX GP-5 + GX GP-6 + HX GP-7 + HI Mult/Div Hi + LO Mult/Div Lo + FG Processor Flags + CR Control register (Writable only in supervisor mode) + IP Instruction Pointer + SP Stack Pointer + JA Jump Address + + ^ I have added this, it is part of my proposal for changing how jumps work + + +### FG Flag Register Bitfield + These registers are Read/Write + They are automatically written to by the processor + + FG[0] Carry + FG[1] Overflow + FG[2] Zero + FG[3] Sign + FG[4-31] RESERVED + +### CR Control Register Bitfield + These registers are Read/Write in System mode, Read Only in User Mode + + CR[0] Interrupt Enable + CR[1] User Mode + CR[2] DMA + CR[3-31] RESERVED + + + +## Integer Instructions + +### R-Type + ADD RD, RS1, RS2 RD = RS1 + RS2 + SUB RD, RS1, RS2 RD = RS1 - RS2 + XOR RD, RS1, RS2 RD = RS1 ^ RS2 + OR RD, RS1, RS2 RD = RS1 | RS2 + AND RD, RS1, RS2 RD = RS1 & RS2 + LSL RD, RS1, RS2 RD = RS1 << RS2 (logical) + LSR RD, RS1, RS2 RD = RS1 >> RS2 (logical) + ASR RD, RS1, RS2 RD = RS1 >> RS2 (arithmetic) + MUL RD, RS1, RS2 HI,LO = RS1 * RS2 + MULU RD, RS1, RS2 HI,LO = RS1 * RS2 (unsigned) + DIV RD, RS1, RS2 HI,LO = RS1 / RS2 + DIVU RD, RS1, RS2 HI,LO = RS1 / RS2 (unsigned) + + #LDB RD, RS1, RS2 RD = &(RS1 + RS2) Load Byte + #STB RD, RS1, RS2 &(RS1 + RS2) = (RD >> 24) Store Byte + LDW RD, RS1, RS2 RD = &(RS1 + 
RS2) Load Word (4 bytes) + STW RD, RS1, RS2 &(RS1 + RS2) = RD Store Word (4 bytes) + + # Deprecated? + + +### I-Type + ADDI RD, RS, IMM RD = RS + IMM + SUBI RD, RS, IMM RD = RS - IMM + XORI RD, RS, IMM RD = RS ^ IMM + ORI RD, RS, IMM RD = RS | IMM + ANDI RD, RS, IMM RD = RS & IMM + LSLI RD, RS, IMM RD = RS << IMM (logical) + LSRI RD, RS, IMM RD = RS >> IMM (logical) + ASRI RD, RS, IMM RD = RS >> IMM (arithmetic) + MULI dd, RS, IMM HI,LO = RS * IMM + MULIU dd, RS, IMM HI,LO = RS * IMM (unsigned) + DIVI dd, RS, IMM HI,LO = RS / IMM + DIVIU dd, RS, IMM HI,LO = RS / IMM (unsigned) + + #LDBI RD, RS, IMM RD = &(RS + IMM) Load Byte + #STBI RD, RS, IMM &(RS + IMM) = (RD >> 24) Store Byte + LDWI RD, RS, IMM RD = &(RS + IMM) Load Word (4 bytes) + STWI RD, RS, IMM &(RS + IMM) = RD Store Word (4 bytes) + + # Deprecated? + + +### JR Instructions + Compare and then jump (IP = JMP) + + JMP 0 if (True) + JMP 1 if (RS1 != RS2) + JMP 2 if (RS1 == RS2) + JMP 3 if (RS1 > RS2) Unsigned + JMP 4 if (RS1 >= RS2) Unsigned + + JMP C if (RS1 > RS2) Signed + JMP D if (RS1 >= RS2) Signed + +### JI Instructions + Compare and then jump (IP = JMP) + + JMPI 0 if (True) + JMPI 1 if (RS1 != IMM) + JMPI 2 if (RS1 == IMM) + JMPI 3 if (RS1 > IMM) Unsigned + JMPI 4 if (RS1 >= IMM) Unsigned + + JMPI C if (RS1 > IMM) Signed + JMPI D if (RS1 >= IMM) Signed + +### Jump Aliases + + JEQ + JLT + JGT + JLE + JGE + JLTU + JGTU + JLEU + JGEU + + +### Control Instructions + NOP Do nothing -> opcode = ZERO + PUSHR RS SP+=1 ;*SP = RS + POPR RS RS = *SP ;SP-=1 + PUSHA PUSHR AX, BX, CX, DX, EX, FX, GX, HX, HI, LO, FG, CR, IP, SP, JA + POPA POP reverse PUSHR, SP not affected + PUSHI IMM SP+=1 ;*SP = IMM + INVP IMM Invalidate entry in TLB + RET POPR IP; + CALL PUSHR IP; IP = JMP; + INT PUSHA ;IP = IDT[IMM] + IRET POPR FG; POPR SP; POPR IP + SIF Set interrupt flag + CIF Clear interrupt flag + +## Interrupt Descriptor Table +This will be in a fixed memory location and will contain pointers to the interrupt functions. 
Once an interrupt is entered, all interrupts are turned off. + + IDT[0] Divide-by-zero exception + IDT[1] Hardware error (NMI) + IDT[2] Overflow + IDT[3] Invalid Opcode + IDT[4] General-protection fault + IDT[5] TLB miss + IDT[6] Software interrupt (reserved for OS) + IDT[7-31] Platform interrupts (PIC, hard drive, keyboard, etc.) + + IDTMSK[0-31] Interrupt mask, when an interrupt is entered the mask bit for the corresponding interrupt will be disabled. + The software is responsible for re-enabling the mask bit + +You get 32 :) +Also, I was thinking of making the OS handle TLB misses + + + + + +## Page Directory + +The page directory contains 1024 page tables that have 1024 entries. + +^ Stupid question: Do we need a page directory? Also I have a very limited size for cache, idt, tlb, etc. Plan on having around 100 Kbits + +### Page table layout + + PT[0] Present + PT[1] R/W + PT[2] User-mode + PT[3-4] RESERVED + PT[5] Accessed + PT[6-7] RESERVED + PT[8-31] Physical address of page table (XX * 2^16 + XXXX) + + *This is still WIP but I wanted to get your input on the layout. 
I also have + the jank memory offset that will more than likely change.* diff --git a/hdl/core.py b/hdl/core.py index 6ef02fb..a75b92e 100644 --- a/hdl/core.py +++ b/hdl/core.py @@ -3,29 +3,254 @@ from amaranth.sim import Simulator, Settle, Delay from utils import cmd -class Template(Elaboratable): + +# class Reg(Elaboratable): +# def __init__(self): +# self.rd_addr = Signal(4) +# self.rs1_addr = Signal(4) +# self.rs2_addr = Signal(4) + +# self.rd = Signal(32) +# self.rs1 = Signal(32) +# self.rs2 = Signal(32) + +# self.zx = Signal(32) +# self.ax = Signal(32) +# self.bx = Signal(32) +# self.bx = Signal(32) +# self.cx = Signal(32) +# self.dx = Signal(32) +# self.ex = Signal(32) +# self.fx = Signal(32) +# self.gx = Signal(32) +# self.hx = Signal(32) +# self.hi = Signal(32) +# self.lo = Signal(32) +# self.fg = Signal(32) +# self.cr = Signal(32) +# self.ip = Signal(32) +# self.sp = Signal(32) +# self.ja = Signal(32) + +# self.reg_ar = Array([self.zx, self.ax, self.bx, self.cx, self.dx, self.ex, self.fx, self.gx, self.hx, self.hi, self.lo, self.fg, self.cr, self.ip, self.sp, self.ja]) + +# # TODO: add support for storing multiplication result +# self.ports = [self.rd_addr, self.rs1_addr, self.rs2_addr, self.rd, self.rs1, self.rs2, self.ip] + +# def elaborate(self, platform=None): +# m = Module() + +# with m.If(self.rd_addr != 0): +# m.d.sync += self.reg_ar[self.rd_addr].eq(self.rd) + +# m.d.comb += self.rs1.eq(self.reg_ar[self.rs1_addr]) +# m.d.comb += self.rs2.eq(self.reg_ar[self.rs2_addr]) + +# return m + + +# class ASAP32Core(Elaboratable): +# def __init__(self): +# self.interupt_msk = Signal(32) +# self.interupt_addr = Signal(32) +# self.interupt_en = Signal(1) +# self.interupt_sig = Signal(1) + +# self.jump = Signal(1) +# self.instruction_addr = Signal(32) + +# self.ports = [] + +# def elaborate(self, platform=None): +# m = Module() + +# m.submodules.reg = reg = Reg() + +# # interupt setup +# m.d.comb += self.interupt_en.eq(reg.cr[0]) + +# # get instruction address, 
account for jumps and interupts +# m.d.sync += self.instruction_addr.eq(Mux(self.interupt_en & self.interupt_sig, self.interupt_addr, Mux(self.jump, reg.ja, reg.ip))) + +# # update program counter +# m.d.sync += reg.ip.eq(self.instruction_addr + 1) + +# return m + +class ALU(Elaboratable): def __init__(self): - ... + self.in1 = Signal(32) + self.in2 = Signal(32) + self.out = Signal(32) + self.op = Signal(4) - self.ports = [...] + self.tmp = Signal(33) + self.signed_op = Signal(1) - def elaborate(self, platform): + self.carry = Signal(1) + self.overflow = Signal(1) + self.zero = Signal(1) + self.sign = Signal(1) + + self.ports = [self.in1, self.in2, self.out, self.op] + + def elaborate(self, platform=None): m = Module() - ... + # dummy sync for simulation only + if platform is None: + dumb = Signal() + m.d.sync += dumb.eq(~dumb) + with m.Switch(self.op): + with m.Case(0b0000): + m.d.comb += self.tmp.eq(self.in1 + self.in2) + with m.Case(0b0010): + m.d.comb += self.tmp.eq(self.in1.as_signed() + self.in2.as_signed()) + m.d.comb += self.signed_op.eq(1) + with m.Case(0b0001): + m.d.comb += self.tmp.eq(self.in1 - self.in2) + with m.Case(0b0011): + m.d.comb += self.tmp.eq(self.in1.as_signed() - self.in2.as_signed()) + m.d.comb += self.signed_op.eq(1) + with m.Case(4): + m.d.comb += self.tmp.eq(Cat(self.in1 & self.in2, 0)) + with m.Case(5): + m.d.comb += self.tmp.eq(Cat(self.in1 | self.in2, 0)) + with m.Case(6): + m.d.comb += self.tmp.eq(Cat(self.in1 ^ self.in2, 0)) + with m.Case(7): + m.d.comb += self.tmp.eq(Cat(self.in1 << self.in2[0:5], 0)) + with m.Case(8): + m.d.comb += self.tmp.eq(Cat(self.in1 >> self.in2[0:5], 0)) + with m.Case(9): + m.d.comb += self.tmp.eq(Cat(self.in1.as_signed() >> self.in2[0:5], 0)) + with m.Case(): + m.d.comb += self.signed_op.eq(0) + m.d.comb += self.tmp.eq(0) + + m.d.comb += self.carry.eq(self.tmp[32]) + m.d.comb += self.overflow.eq(self.tmp[32] ^ self.tmp[31]) + m.d.comb += self.sign.eq(self.tmp.as_signed() < 0) + m.d.comb += 
self.zero.eq(self.out == 0) + + m.d.comb += self.out.eq(self.tmp[0:32]) + return m +def test_alu(filename="alu.vcd"): + dut = ALU() + def proc1(): + def sub_proc(val1, val2): + yield dut.in1.eq(val1) + yield dut.in2.eq(val2) + yield + yield Settle() + + # test unsigned addition + yield dut.op.eq(0b0000) + yield from sub_proc(27, 13) + out = yield dut.out + assert 27 + 13 == (out), f'ERROR: {out} != {27 + 13}' + + # test signed addition + yield dut.op.eq(0b0010) + yield from sub_proc(-11, 43) + out = yield dut.out.as_signed() + assert -11 + 43 == out, f'ERROR: {out} != {-11 + 43}' + # test unsigned subtraction + yield dut.op.eq(0b0001) + yield from sub_proc(25, 13) + out = yield dut.out + assert 25 - 13 == out, f'ERROR: {out} != {25 - 13}' + # test signed subtraction + yield dut.op.eq(0b0011) + yield from sub_proc(25, -13) + out = yield dut.out.as_signed() + assert 25 + 13 == out, f'ERROR: {out} != {25 + 13}' -def test(filename="out.vcd"): - dut = ... + # test unsigned logical and + yield dut.op.eq(4) + yield from sub_proc(0b10101011, 0b01010101) + out = yield dut.out + assert 0b00000001 == out, f'ERROR: {out} != {0b00000001}' - def proc1(): - ... 
+ # test unsigned logical or + yield dut.op.eq(5) + yield from sub_proc(0b10101011, 0b01000101) + out = yield dut.out + assert 0b11101111 == out, f'ERROR: {out} != {0b11101111}' + + # test logical xor + yield dut.op.eq(6) + yield from sub_proc(0b10001011, 0b01000101) + out = yield dut.out + assert 0b11001110 == out, f'ERROR: {out} != {0b11001110}' + # test logical shift left + yield dut.op.eq(7) + yield from sub_proc(0b10001011, 5) # shift left by 5 + out = yield dut.out + assert 0b1000101100000 == out, f'ERROR: {out} != {0b1000101100000}' + + # test logical shift right + yield dut.op.eq(8) + yield from sub_proc(0b10001011, 5) # shift right by 5 + out = yield dut.out + assert 0b100 == out, f'ERROR: {out} != {0b100}' + + # test aligned shift right + yield dut.op.eq(9) + yield from sub_proc(0x80001234, 4) # shift right by 4 + out = yield dut.out + assert 0xF8000123 == out, f'ERROR: {out} != {0xF8000123}' + + # test unsigned overflow + yield dut.op.eq(0b0000) + yield from sub_proc(0xFFFFFFFF, 1) # add 1 to 0xFFFFFFFF + out = yield dut.overflow + assert out == 1, f'ERROR: {out} != {1}' + out = yield dut.carry + assert out == 1, f'ERROR: {out} != {1}' + + # test signed overflow + yield dut.op.eq(0b0010) + yield from sub_proc(0x7FFFFFFF, 1) # add 1 to 0x7FFFFFFF + out = yield dut.overflow + assert out == 1, f'ERROR: {out} != {1}' + out = yield dut.carry + assert out == 0, f'ERROR: {out} != {0}' + + # test unsigned underflow + yield dut.op.eq(0b0001) + yield from sub_proc(0, -1) # subtract 1 from 0 + out = yield dut.overflow + assert out == 1, f'ERROR: {out} != {1}' + out = yield dut.carry + assert out == 1, f'ERROR: {out} != {1}' + + # test signed underflow + yield dut.op.eq(0b0010) + yield from sub_proc(0x80000000, -1) # sub 1 from 0x80000000 (most negative number in two's complement) + assert out == 1, f'ERROR: {out} != {1}' + out = yield dut.carry + assert out == 1, f'ERROR: {out} != {1}' + + # test zero + yield dut.op.eq(0b0000) + yield from sub_proc(0, 0) # add 0 to 
0 + out = yield dut.zero + assert out == 1, f'ERROR: {out} != {1}' + + # test zero + yield dut.op.eq(0b0000) + yield from sub_proc(0, 1) # add 0 to 0 + out = yield dut.zero + assert out == 0, f'ERROR: {out} != {0}' + sim = Simulator(dut) sim.add_clock(1e-6) @@ -36,5 +261,5 @@ def test(filename="out.vcd"): if __name__ == '__main__': - shift_reg = Template(...) - cmd(shift_reg, test) \ No newline at end of file + hdl = ALU() + cmd(hdl, test_alu) -- cgit v1.2.3