4 files changed, 469 insertions, 12 deletions
diff --git a/.gitignore b/.gitignore
index 1642c9d..ec81623 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,6 @@ __pycache__/
 *.vcd
 *.cc
 *.bak
-*.pytest_cache
-\ No newline at end of file
+*.pytest_cache
+*.pyc
+*.log
+\ No newline at end of file
diff --git a/.vscode/.gitignore b/.vscode/.gitignore
new file mode 100644
index 0000000..bf0824e
--- /dev/null
+++ b/.vscode/.gitignore
@@ -0,0 +1 @@
+*.log
+\ No newline at end of file
diff --git a/doc/ASAP32-ISA.md b/doc/ASAP32-ISA.md
new file mode 100644
index 0000000..b5ed60d
--- /dev/null
+++ b/doc/ASAP32-ISA.md
@@ -0,0 +1,229 @@
+# Vertex KISS 32 - Machine Code Spec
+^ So Vertex is already a name for an FPGA, should we change this?
+I propose a different name:
+
+    ASAP Soc 32 a KISS inspired architecture
+
+    As
+    Simple
+    As
+    Possible
+
+## General Instruction Format
+    X = HEX
+    B = BIN
+
+    d = HEX (not used /  don't care)
+
+    MAX INSTRUCTIONS = 256
+    ALL INSTRUCTIONS ARE ATOMIC
+
+    ALL MEMORY ADDRESSES ARE 32-bit, not 8-bit
+    ^ What do you think about this? 32 bits is usually an 'int' in C
+    This would extend the address space, kinda
+
+    Also I have been reading about caching, I think the instruction width needs to be the same as the data memory
+
+### C-Type, Control
+    XX       X    X       XXXX
+    Opcode   d    RS1     IMM
+
+### I-Type, Immediate
+    XX        X     X     XXXX
+    Opcode    RD    RS1   IMM
+
+### R-Type, Arithmetic
+    XX         X      X      X      XXX
+    Opcode     RD     RS1    RS2    ddd
+
+### JR-Type, Compare and Jump
+    XX          X                   X      X        XXX
+    Opcode      Jump Condition      RS1    RS2      ddd
+
+### JI-Type, Compare and Jump
+    XX          X                X     XXXX            
+    Opcode      Jump Condition   RS1   IMM
+
+## Registers
+    Maximum registers = 16
+    Register width = 32
+    All are R/W except 0X
+
+    0X      Always Zero
+    AX      GP-0
+    BX      GP-1
+    CX      GP-2
+    DX      GP-3
+    EX      GP-4
+    FX      GP-5
+    GX      GP-6
+    HX      GP-7
+    HI      Mult/Div Hi
+    LO      Mult/Div Lo
+    FG      Processor Flags
+    CR      Control register (Writable only in supervisor mode)
+    IP      Instruction Pointer
+    SP      Stack Pointer
+    JA      Jump Address
+
+    ^ I have added this, it is part of my proposal for changing how jumps work
+
+
+### FG Flag Register Bitfield
+    These flags are Read/Write
+    They are automatically written to by the processor
+
+    FG[0]           Carry
+    FG[1]           Overflow
+    FG[2]           Zero
+    FG[3]           Sign
+    FG[4-31]        RESERVED
+
+### CR Control Register Bitfield
+    This register is Read/Write in System mode, Read Only in User Mode
+
+    CR[0]           Interrupt Enable
+    CR[1]           User Mode
+    CR[2]           DMA
+    CR[3-31]        RESERVED
+
+
+
+## Integer Instructions
+
+### R-Type
+    ADD     RD, RS1, RS2        RD = RS1 + RS2
+    SUB     RD, RS1, RS2        RD = RS1 - RS2
+    XOR     RD, RS1, RS2        RD = RS1 ^ RS2
+    OR      RD, RS1, RS2        RD = RS1 | RS2
+    AND     RD, RS1, RS2        RD = RS1 & RS2
+    LSL     RD, RS1, RS2        RD = RS1 << RS2 (logical)
+    LSR     RD, RS1, RS2        RD = RS1 >> RS2 (logical)
+    ASR     RD, RS1, RS2        RD = RS1 >> RS2 (arithmetic)
+    MUL     RD, RS1, RS2     HI,LO = RS1 * RS2
+    MULU    RD, RS1, RS2     HI,LO = RS1 * RS2 (unsigned)
+    DIV     RD, RS1, RS2     HI,LO = RS1 / RS2
+    DIVU    RD, RS1, RS2     HI,LO = RS1 / RS2 (unsigned)
+
+    #LDB     RD, RS1, RS2        RD = &(RS1 + RS2)           Load Byte
+    #STB     RD, RS1, RS2        &(RS1 + RS2) = (RD >> 24)   Store Byte
+    LDW     RD, RS1, RS2        RD = &(RS1 + RS2)           Load Word (4 bytes)
+    STW     RD, RS1, RS2        &(RS1 + RS2) = RD           Store Word (4 bytes)
+
+    # Deprecated?
+
+
+### I-Type
+    ADDI    RD, RS, IMM         RD = RS + IMM
+    SUBI    RD, RS, IMM         RD = RS - IMM
+    XORI    RD, RS, IMM         RD = RS ^ IMM
+    ORI     RD, RS, IMM         RD = RS | IMM
+    ANDI    RD, RS, IMM         RD = RS & IMM
+    LSLI    RD, RS, IMM         RD = RS << IMM (logical)
+    LSRI    RD, RS, IMM         RD = RS >> IMM (logical)
+    ASRI    RD, RS, IMM         RD = RS >> IMM (arithmetic)
+    MULI    dd, RS, IMM      HI,LO = RS * IMM
+    MULIU   dd, RS, IMM      HI,LO = RS * IMM (unsigned)
+    DIVI    dd, RS, IMM      HI,LO = RS / IMM
+    DIVIU   dd, RS, IMM      HI,LO = RS / IMM (unsigned)
+
+    #LDBI    RD, RS, IMM     RD = &(RS + IMM)           Load Byte
+    #STBI    RD, RS, IMM     &(RS + IMM) = (RD >> 24)   Store Byte
+    LDWI    RD, RS, IMM     RD = &(RS + IMM)           Load Word (4 bytes)
+    STWI    RD, RS, IMM     &(RS + IMM) = RD           Store Word (4 bytes)
+
+    # Deprecated?
+
+
+### JR Instructions
+    Compare and then jump (IP = JMP)
+
+    JMP     0          if (True)
+    JMP     1          if (RS1 != RS2)
+    JMP     2          if (RS1 == RS2)
+    JMP     3          if (RS1 >  RS2)  Unsigned
+    JMP     4          if (RS1 >= RS2)  Unsigned
+
+    JMP     C          if (RS1 >  RS2)  Signed
+    JMP     D          if (RS1 >= RS2)  Signed
+
+### JI Instructions
+    Compare and then jump (IP = JMP)
+
+    JMPI     0          if (True)
+    JMPI     1          if (RS1 != IMM)
+    JMPI     2          if (RS1 == IMM)
+    JMPI     3          if (RS1 >  IMM)  Unsigned
+    JMPI     4          if (RS1 >= IMM)  Unsigned
+
+    JMPI     C          if (RS1 >  IMM)  Signed
+    JMPI     D          if (RS1 >= IMM)  Signed
+
+### Jump Aliases
+
+    JEQ
+    JLT     
+    JGT
+    JLE
+    JGE
+    JLTU 
+    JGTU
+    JLEU   
+    JGEU   
+
+
+### Control Instructions
+    NOP                     Do nothing -> opcode = ZERO
+    PUSHR   RS              SP+=1     ;*SP = RS
+    POPR    RS              RS = *SP  ;SP-=1
+    PUSHA                   PUSHR AX, BX, CX, DX, EX, FX, GX, HX, HI, LO, FG, CR, IP, SP, JA
+    POPA                    POP reverse PUSHR, SP not affected
+    PUSHI   IMM             SP+=1     ;*SP = IMM
+    INVP    IMM             Invalidate entry in TLB
+    RET                     POPR IP;
+    CALL                    PUSHR IP; IP = JMP;
+    INT                     PUSHA ;IP = IDT[IMM]
+    IRET                    POPR FLG; POPR SP; POPR IP
+    SIF                     Set interrupt flag
+    CIF                     Clear interrupt flag
+
+## Interrupt Descriptor Table
+This will be in a fixed memory location and will contain pointers to the interrupt functions. Once an interrupt is entered, all interrupts are turned off.
+
+    IDT[0]          Divide-by-zero exception
+    IDT[1]          Hardware error (NMI)
+    IDT[2]          Overflow
+    IDT[3]          Invalid Opcode
+    IDT[4]          General-protection fault
+    IDT[5]          TLB miss
+    IDT[6]          Software interrupt (reserved for OS)
+    IDT[7-31]       Platform interrupts (PIC, hard drive, keyboard, etc.)
+
+    IDTMSK[0-31]    Interrupt mask, when an interrupt is entered the mask bit for the corresponding interrupt will be disabled.
+                    The software is responsible for re-enabling the mask bit
+
+You get 32 :)
+Also, I was thinking of making the OS handle TLB misses
+
+
+
+
+
+## Page Directory
+
+The page directory contains 1024 page tables that have 1024 entries.
+
+^ Stupid question: Do we need a page directory? Also I have a very limited size for cache, idt, tlb, etc. Plan on having around 100 Kbits
+
+### Page table layout
+
+    PT[0]           Present
+    PT[1]           R/W
+    PT[2]           User-mode
+    PT[3-4]         RESERVED
+    PT[5]           Accessed
+    PT[6-7]         RESERVED
+    PT[8-31]        Physical address of page table (XX * 2^16 + XXXX)
+
+        *This is still WIP but I wanted to get your input on the layout. I also have
+        the jank memory offset that will more than likely change.*
diff --git a/hdl/core.py b/hdl/core.py
index 6ef02fb..a75b92e 100644
--- a/hdl/core.py
+++ b/hdl/core.py
@@ -3,29 +3,254 @@ from amaranth.sim import Simulator, Settle, Delay
 
 from utils import cmd
 
-class Template(Elaboratable):
+
+# class Reg(Elaboratable):
+#     def __init__(self):
+#         self.rd_addr = Signal(4)
+#         self.rs1_addr = Signal(4)
+#         self.rs2_addr = Signal(4)
+
+#         self.rd = Signal(32)
+#         self.rs1 = Signal(32)
+#         self.rs2 = Signal(32)
+
+#         self.zx = Signal(32)
+#         self.ax = Signal(32)
+#         self.bx = Signal(32)
+#         self.bx = Signal(32)
+#         self.cx = Signal(32)
+#         self.dx = Signal(32)
+#         self.ex = Signal(32)
+#         self.fx = Signal(32)
+#         self.gx = Signal(32)
+#         self.hx = Signal(32)
+#         self.hi = Signal(32)
+#         self.lo = Signal(32)
+#         self.fg = Signal(32)
+#         self.cr = Signal(32)
+#         self.ip = Signal(32)
+#         self.sp = Signal(32)
+#         self.ja = Signal(32)
+
+#         self.reg_ar = Array([self.zx, self.ax, self.bx, self.cx, self.dx, self.ex, self.fx, self.gx, self.hx, self.hi, self.lo, self.fg, self.cr, self.ip, self.sp, self.ja])
+
+#         # TODO: add support for storing multiplication result
+#         self.ports = [self.rd_addr, self.rs1_addr, self.rs2_addr, self.rd, self.rs1, self.rs2, self.ip]
+
+#     def elaborate(self, platform=None):
+#         m = Module()
+
+#         with m.If(self.rd_addr != 0):
+#             m.d.sync += self.reg_ar[self.rd_addr].eq(self.rd)
+
+#         m.d.comb += self.rs1.eq(self.reg_ar[self.rs1_addr])
+#         m.d.comb += self.rs2.eq(self.reg_ar[self.rs2_addr])
+
+#         return m
+
+
+# class ASAP32Core(Elaboratable):
+#     def __init__(self):
+#         self.interupt_msk = Signal(32)
+#         self.interupt_addr = Signal(32)
+#         self.interupt_en = Signal(1)
+#         self.interupt_sig = Signal(1)
+
+#         self.jump = Signal(1)
+#         self.instruction_addr = Signal(32)
+
+#         self.ports = []
+
+#     def elaborate(self, platform=None):
+#         m = Module()
+
+#         m.submodules.reg = reg = Reg()
+
+#         # interupt setup
+#         m.d.comb += self.interupt_en.eq(reg.cr[0])
+
+#         # get instruction address, account for jumps and interupts
+#         m.d.sync += self.instruction_addr.eq(Mux(self.interupt_en & self.interupt_sig, self.interupt_addr, Mux(self.jump, reg.ja, reg.ip)))
+
+#         # update program counter
+#         m.d.sync += reg.ip.eq(self.instruction_addr + 1)
+
+#         return m
+
+class ALU(Elaboratable):
     def __init__(self):
-        ...
+        self.in1 = Signal(32)
+        self.in2 = Signal(32)
+        self.out = Signal(32)
+        self.op = Signal(4)
 
-        self.ports = [...]
+        self.tmp = Signal(33)
+        self.signed_op = Signal(1)
 
-    def elaborate(self, platform):
+        self.carry = Signal(1)
+        self.overflow = Signal(1)
+        self.zero = Signal(1)
+        self.sign = Signal(1)
+
+        self.ports = [self.in1, self.in2, self.out, self.op]
+
+    def elaborate(self, platform=None):
         m = Module()
 
-        ...
+        # dummy sync for simulation only
+        if platform is None:
+            dumb = Signal()
+            m.d.sync += dumb.eq(~dumb)
 
+        with m.Switch(self.op):
+            with m.Case(0b0000):
+                m.d.comb += self.tmp.eq(self.in1 + self.in2)
+            with m.Case(0b0010):
+                m.d.comb += self.tmp.eq(self.in1.as_signed() + self.in2.as_signed())
+                m.d.comb += self.signed_op.eq(1)
+            with m.Case(0b0001):
+                m.d.comb += self.tmp.eq(self.in1 - self.in2)
+            with m.Case(0b0011):
+                m.d.comb += self.tmp.eq(self.in1.as_signed() - self.in2.as_signed())
+                m.d.comb += self.signed_op.eq(1)
+            with m.Case(4):
+                m.d.comb += self.tmp.eq(Cat(self.in1 & self.in2, 0))
+            with m.Case(5):
+                m.d.comb += self.tmp.eq(Cat(self.in1 | self.in2, 0))
+            with m.Case(6):
+                m.d.comb += self.tmp.eq(Cat(self.in1 ^ self.in2, 0))
+            with m.Case(7):
+                m.d.comb += self.tmp.eq(Cat(self.in1 << self.in2[0:5], 0))
+            with m.Case(8):
+                m.d.comb += self.tmp.eq(Cat(self.in1 >> self.in2[0:5], 0))
+            with m.Case(9):
+                m.d.comb += self.tmp.eq(Cat(self.in1.as_signed() >> self.in2[0:5], 0))
+            with m.Case():
+                m.d.comb += self.signed_op.eq(0)
+                m.d.comb += self.tmp.eq(0)
+        
+        m.d.comb += self.carry.eq(self.tmp[32])
+        m.d.comb += self.overflow.eq(self.tmp[32] ^ self.tmp[31])
+        m.d.comb += self.sign.eq(self.tmp.as_signed() < 0)
+        m.d.comb += self.zero.eq(self.out == 0)
+
+        m.d.comb += self.out.eq(self.tmp[0:32])
+        
         return m
 
+def test_alu(filename="alu.vcd"):
+    dut = ALU()
 
+    def proc1():
+        def sub_proc(val1, val2):
+            yield dut.in1.eq(val1)
+            yield dut.in2.eq(val2)
+            yield
+            yield Settle()
+
+        # test unsigned addition
+        yield dut.op.eq(0b0000)
+        yield from sub_proc(27, 13)
+        out = yield dut.out
+        assert 27 + 13 == (out), f'ERROR: {out} != {27 + 13}'
+
+        # test signed addition
+        yield dut.op.eq(0b0010)
+        yield from sub_proc(-11, 43)
+        out = yield dut.out.as_signed()
+        assert -11 + 43 == out, f'ERROR: {out} != {-11 + 43}'
 
+        # test unsigned subtraction
+        yield dut.op.eq(0b0001)
+        yield from sub_proc(25, 13)
+        out = yield dut.out
+        assert 25 - 13 == out, f'ERROR: {out} != {25 - 13}'
         
+        # test signed subtraction
+        yield dut.op.eq(0b0011)
+        yield from sub_proc(25, -13)
+        out = yield dut.out.as_signed()
+        assert 25 + 13 == out, f'ERROR: {out} != {25 + 13}'
 
-def test(filename="out.vcd"):
-    dut = ...
+        # test unsigned logical and
+        yield dut.op.eq(4)
+        yield from sub_proc(0b10101011, 0b01010101)
+        out = yield dut.out
+        assert 0b00000001 == out, f'ERROR: {out} != {0b00000001}'
 
-    def proc1():
-        ...
+        # test unsigned logical or
+        yield dut.op.eq(5)
+        yield from sub_proc(0b10101011, 0b01000101)
+        out = yield dut.out
+        assert 0b11101111 == out, f'ERROR: {out} != {0b11101111}'
+
+        # test logical xor
+        yield dut.op.eq(6)
+        yield from sub_proc(0b10001011, 0b01000101)
+        out = yield dut.out
+        assert 0b11001110 == out, f'ERROR: {out} != {0b11001110}'
 
+        # test logical shift left
+        yield dut.op.eq(7)
+        yield from sub_proc(0b10001011, 5) # shift left by 5
+        out = yield dut.out
+        assert 0b1000101100000 == out, f'ERROR: {out} != {0b1000101100000}'
+
+        # test logical shift right
+        yield dut.op.eq(8)
+        yield from sub_proc(0b10001011, 5) # shift right by 5
+        out = yield dut.out
+        assert 0b100 == out, f'ERROR: {out} != {0b100}'
+
+        # test aligned shift right
+        yield dut.op.eq(9)
+        yield from sub_proc(0x80001234, 4) # shift right by 4
+        out = yield dut.out
+        assert 0xF8000123 == out, f'ERROR: {out} != {0xF8000123}'
+
+        # test unsigned overflow
+        yield dut.op.eq(0b0000)
+        yield from sub_proc(0xFFFFFFFF, 1) # add 1 to 0xFFFFFFFF
+        out = yield dut.overflow
+        assert out == 1, f'ERROR: {out} != {1}'
+        out = yield dut.carry
+        assert out == 1, f'ERROR: {out} != {1}'
+
+        # test signed overflow
+        yield dut.op.eq(0b0010)
+        yield from sub_proc(0x7FFFFFFF, 1) # add 1 to 0x7FFFFFFF
+        out = yield dut.overflow
+        assert out == 1, f'ERROR: {out} != {1}'
+        out = yield dut.carry
+        assert out == 0, f'ERROR: {out} != {0}'
+
+        # test unsigned underflow
+        yield dut.op.eq(0b0001)
+        yield from sub_proc(0, -1) # subtract 1 from 0
+        out = yield dut.overflow
+        assert out == 1, f'ERROR: {out} != {1}'
+        out = yield dut.carry
+        assert out == 1, f'ERROR: {out} != {1}'
+
+        # test signed underflow
+        yield dut.op.eq(0b0010)
+        yield from sub_proc(0x80000000, -1) # sub 1 from 0x80000000 (most negative number in two's complement)
+        assert out == 1, f'ERROR: {out} != {1}'
+        out = yield dut.carry
+        assert out == 1, f'ERROR: {out} != {1}'
+
+        # test zero
+        yield dut.op.eq(0b0000)
+        yield from sub_proc(0, 0) # add 0 to 0
+        out = yield dut.zero
+        assert out == 1, f'ERROR: {out} != {1}'
+
+        # test zero
+        yield dut.op.eq(0b0000)
+        yield from sub_proc(0, 1) # add 0 to 0
+        out = yield dut.zero
+        assert out == 0, f'ERROR: {out} != {0}'
+        
 
     sim = Simulator(dut)
     sim.add_clock(1e-6)
@@ -36,5 +261,5 @@ def test(filename="out.vcd"):
 
 
 if __name__ == '__main__':
-    shift_reg = Template(...)
-    cmd(shift_reg, test)
-\ No newline at end of file
+    hdl = ALU()
+    cmd(hdl, test_alu)