diff --git a/sim/verilog6502_32bit_test.py b/sim/verilog6502_32bit_test.py
index 86cb441..fd9ad00 100644
--- a/sim/verilog6502_32bit_test.py
+++ b/sim/verilog6502_32bit_test.py
@@ -476,6 +476,70 @@ async def test_indirect_indexed(dut):
     await check_instruction_sequence(dut, expected_cpu_outputs)
 
 @cocotb.test
+async def test_jsr(dut):
+    cocotb.start_soon(Clock(dut.clk, CLK_PERIOD, unit="ns").start())
+    cocotb.start_soon(handle_memory(dut))
+
+    write_dword(0xfffffff4, 0x200)
+
+    # @0x200
+    # ldx #$0
+    # txs
+    # jsr $12345678
+    # wai
+    #
+    # @0x1234
+    # rts
+    write_bytes(0x200, [0xa2, 0xff, 0x9a, 0x20, 0x78, 0x56, 0x34, 0x12, 0xcb])
+    write_bytes(0x12345678, [0x60])
+
+    dut.RDY.value = Immediate(1)
+
+    dut.reset.value = Immediate(1)
+    for _ in range(10):
+        await RisingEdge(dut.clk)
+    dut.reset.value = 0
+
+    expected_cpu_outputs = [
+        None,   # ignore reset sequence
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        None,
+        (0x00000200, False, None),  # ldx #$00
+        (0x00000201, False, None),  # Immediate
+        (0x00000202, False, None),  # txs
+        (0x00000203, False, None),  # second cycle of txs
+        (0x00000203, False, None),  # jsr $12345678
+        (0x00000204, False, None),  # first byte of address
+        (0x000001ff, True,  0x00),  # 24-31
+        (0x000001fe, True,  0x00),  # 16-23
+        (0x000001fd, True,  0x02),  # 8-15
+        (0x000001fc, True,  0x05),  # 7-0
+        (0x00000205, False, None),  # second byte of address
+        (0x00000206, False, None),  # third byte of address
+        (0x00000207, False, None),  # fourth byte of address
+        (0x00000208, False, None),  # receive last byte of address
+        (0x12345678, False, None),  # rts
+        (0x12345679, False, None),  # rts
+        (0x000001fb, False, None),  # current stack while we add 1 to it
+        (0x000001fc, False, None),  # 7-0
+        (0x000001fd, False, None),  # 15-8
+        (0x000001fe, False, None),  # 23-16
+        (0x000001ff, False, None),  # 31-24
+        (0x1234567c, False, None),  # Updating PC before jump
+        (0x00000208, False, None),  # WAI
+        (0x00000209, False, None),  # second wai
+        (0x0000020a, False, None),  # third wai
+    ]
+
+    await check_instruction_sequence(dut, expected_cpu_outputs)
+
+# @cocotb.test
 async def test_adc(dut):
     cocotb.start_soon(Clock(dut.clk, CLK_PERIOD, unit="ns").start())
     cocotb.start_soon(handle_memory(dut))
diff --git a/src/cpu_65c02.v b/src/cpu_65c02.v
index 91e64a5..cdb2eb0 100644
--- a/src/cpu_65c02.v
+++ b/src/cpu_65c02.v
@@ -141,12 +141,15 @@ reg [6:0] state;
  * control signals
  */
 
-reg PC_inc;             // Increment PC
+reg [1:0] PC_inc;             // Increment PC
 reg [31:0] PC_temp;     // intermediate value of PC
 
 reg [1:0] src_reg;      // source register index
 reg [1:0] dst_reg;      // destination register index
 
+reg sr_sel;             // choose to load shift register from dimux or from alu
+reg alu_sr_enable;      // choose to shift or not
+
 reg index_y;            // if set, then Y is index reg rather than X
 reg load_reg;           // loading a register (A, X, Y, S) in this instruction
 reg inc;                // increment
@@ -201,6 +204,10 @@ parameter
         OP_ROL = 4'b1011,
         OP_A   = 4'b1111;
 
+parameter
+        SR_ALU = 1'b0,
+        SR_DI  = 1'b1;
+
 /*
  * Microcode state machine. Basically, every addressing mode has its own
  * path through the state machine. Additional information, such as the
@@ -238,7 +245,7 @@ parameter
     JSR0   = 7'd26, // JSR     - push PCH, save LSB, send S to ALU (-1)
     JSR1   = 7'd27, // JSR     - push PCL, send S to ALU (-1)
     JSR2   = 7'd28, // JSR     - write S
-    JSR3   = 7'd29, // JSR     - fetch MSB
+    JSR4   = 7'd29, // JSR     - fetch MSB
     PULL0  = 7'd30, // PLP/PLA/PLX/PLY - save next op in IRHOLD, send S to ALU (+1)
     PULL1  = 7'd31, // PLP/PLA/PLX/PLY - fetch data from stack, write S
     PULL2  = 7'd32, // PLP/PLA/PLX/PLY - prefetch op, but don't increment PC
@@ -254,7 +261,7 @@ parameter
     RTS0   = 7'd42, // RTS     - send S to ALU (+1)
     RTS1   = 7'd43, // RTS     - read PCL from stack
     RTS2   = 7'd44, // RTS     - write PCL to ALU, read PCH
-    RTS3   = 7'd45, // RTS     - load PC and increment
+    RTS5   = 7'd45, // RTS     - load PC and increment
     WRITE  = 7'd46, // Write memory for read/modify/write
     ZP0    = 7'd47, // Z-page  - fetch ZP address
     ZPX0   = 7'd48, // ZP, X   - fetch ZP, and send to ALU (+X)
@@ -279,7 +286,13 @@ parameter
     INDY3  = 7'd67, // (ZP),Y  - fetch at ZP+3, and send byte 2 to ALU (+Carry)
     INDY4  = 7'd68, // (ZP),Y  - fetch data, and send byte 3 to ALU (+Carry)
     INDX3  = 7'd69, // (ZP,X)  - fetch addr 2 at ZP+X+2
-    INDX4  = 7'd70; // (ZP,X)  - fetch addr 3 at ZP+X+3
+    INDX4  = 7'd70, // (ZP,X)  - fetch addr 3 at ZP+X+3
+    JSR3   = 7'd71,
+    JSR5   = 7'd73,
+    JSR6   = 7'd74,
+    JSR7   = 7'd75,
+    RTS3   = 7'd76,
+    RTS4   = 7'd77;
 
 
 `ifdef SIM
@@ -330,6 +343,10 @@ always @*
             JSR1:   statename = "JSR1";
             JSR2:   statename = "JSR2";
             JSR3:   statename = "JSR3";
+            JSR4:   statename = "JSR4";
+            JSR5:   statename = "JSR5";
+            JSR6:   statename = "JSR6";
+            JSR7:   statename = "JSR7";
             RTI0:   statename = "RTI0";
             RTI1:   statename = "RTI1";
             RTI2:   statename = "RTI2";
@@ -339,6 +356,8 @@ always @*
             RTS1:   statename = "RTS1";
             RTS2:   statename = "RTS2";
             RTS3:   statename = "RTS3";
+            RTS4:   statename = "RTS4";
+            RTS5:   statename = "RTS5";
             BRK0:   statename = "BRK0";
             BRK1:   statename = "BRK1";
             BRK2:   statename = "BRK2";
@@ -387,9 +406,11 @@ always @*
         JMP3,
         JMPI3,
         JMPIX3,
-        JSR3,
-        RTS3,
-        RTI4:           PC_temp = { DIMUX, ADD, alu_sr_0, alu_sr_1};
+        JSR7:           PC_temp = { DIMUX, ADD, alu_sr_0, alu_sr_1};
+
+        RTS5,
+        RTI4:           PC_temp = { DIMUX, ADD, alu_sr_0, alu_sr_1} + 2;
+
 
         BRA1:           PC_temp = { ABR[15:8], ADD };
 
@@ -407,10 +428,11 @@ always @*
  */
 always @*
     case( state )
-        DECODE:         if( (~I & IRQ) | NMI_edge )
+        DECODE:         if( (~I & IRQ) | NMI_edge ) begin
                             PC_inc = 0;
-                        else
+                        end else begin
                             PC_inc = 1;
+                        end
 
         ABS0,
         ABS1,
@@ -434,12 +456,17 @@ always @*
         JMP1,
         JMP2,
         JMP3,
+        JSR4,
+        JSR5,
+        JSR6,
         RTI4,
-        RTS3:           PC_inc = 1;
+        RTS3,
+        RTS4,
+        RTS5:           PC_inc = 1;
 
-        JMPIX3:         PC_inc = ~CO;       // Don't increment PC if we are going to go through JMPIX4
+        JMPIX3:         PC_inc = {1'b0, ~CO};       // Don't increment PC if we are going to go through JMPIX4
 
-        BRA1:           PC_inc = CO ^~ backwards;
+        BRA1:           PC_inc = {1'b0, CO ^~ backwards};
 
         default:        PC_inc = 0;
     endcase
@@ -483,9 +510,13 @@ always @*
 
         BRK1,
         JSR1,
+        JSR2,
+        JSR3,
         PULL1,
         RTS1,
         RTS2,
+        RTS3,
+        RTS4,
         RTI1,
         RTI2,
         RTI3,
@@ -612,11 +643,11 @@ always @*
         DECODE: write_register = load_reg & ~plp;
 
         PULL1,
-         RTS2,
+         RTS4,
          RTI3,
          BRK5,
          JSR0,
-         JSR2 : write_register = 1;
+         JSR4 : write_register = 1;
 
        default: write_register = 0;
     endcase
@@ -703,13 +734,15 @@ always @*
         BRK5,
         JSR0,
         JSR2,
+        JSR3,
+        JSR4,
         PULL0,
         PULL1,
         PUSH1,
         RTI0,
         RTI3,
         RTS0,
-        RTS2   : regsel = SEL_S;
+        RTS4   : regsel = SEL_S;
 
         default: regsel = src_reg;
     endcase
@@ -734,8 +767,30 @@ ALU ALU( .clk(clk),
          .RDY(RDY) );
 
 always @(posedge clk) begin
-    alu_sr_0 <= ADD;
-    alu_sr_1 <= alu_sr_0;
+    if (alu_sr_enable) begin
+        if (sr_sel == SR_ALU) begin
+            alu_sr_0 <= ADD;
+        end else begin
+            alu_sr_0 <= DIMUX;
+        end
+        alu_sr_1 <= alu_sr_0;
+    end
+end
+
+always @* begin
+    case ( state )
+        RTS2,
+        RTS3:       sr_sel = SR_DI;
+
+        default:    sr_sel = SR_ALU;
+    endcase
+end
+
+always @*begin
+    case ( state)
+        RTS4:       alu_sr_enable = 0;
+        default:    alu_sr_enable = 1;
+    endcase
 end
 
 /*
@@ -761,7 +816,9 @@ always @*
         BRK3,
         BRK4,
         JSR0,
-        JSR1:   alu_op = OP_SUB;
+        JSR1,
+        JSR2,
+        JSR3:   alu_op = OP_SUB;
 
      default:   alu_op = OP_ADD;
     endcase
@@ -791,7 +848,11 @@ always @(posedge clk)
 always @*
     case( state )
         JSR1,
+        JSR2,
+        JSR3,
         RTS1,
+        RTS2,
+        RTS3,
         RTI1,
         RTI2,
         BRK1,
@@ -806,8 +867,10 @@ always @*
         ABSX0,
         RTI0,
         RTS0,
+        RTS1,
+        RTS2,
         JSR0,
-        JSR2,
+        JSR4,
         BRK0,
         PULL0,
         INDY1,
@@ -836,6 +899,8 @@ always @*
     case( state )
          BRA1,
          RTS1,
+         RTS2,
+         RTS3,
          RTI0,
          RTI1,
          RTI2,
@@ -843,6 +908,8 @@ always @*
          JSR0,
          JSR1,
          JSR2,
+         JSR3,
+         JSR4,
          BRK0,
          BRK1,
          BRK2,
@@ -891,6 +958,8 @@ always @*
         RTI2,
         RTS0,
         RTS1,
+        RTS2,
+        RTS3,
         INDY0:  CI = 1;
 
         default:        CI = 0;
@@ -1140,7 +1209,11 @@ always @(posedge clk or posedge reset)
         JSR0    : state <= JSR1;
         JSR1    : state <= JSR2;
         JSR2    : state <= JSR3;
-        JSR3    : state <= FETCH;
+        JSR3    : state <= JSR4;
+        JSR4    : state <= JSR5;
+        JSR5    : state <= JSR6;
+        JSR6    : state <= JSR7;
+        JSR7    : state <= FETCH;
 
         RTI0    : state <= RTI1;
         RTI1    : state <= RTI2;
@@ -1151,7 +1224,9 @@ always @(posedge clk or posedge reset)
         RTS0    : state <= RTS1;
         RTS1    : state <= RTS2;
         RTS2    : state <= RTS3;
-        RTS3    : state <= FETCH;
+        RTS3    : state <= RTS4;
+        RTS4    : state <= RTS5;
+        RTS5    : state <= FETCH;
 
         BRA0    : state <= cond_true ? BRA1 : DECODE;
         BRA1    : state <= (CO ^ backwards) ? BRA2 : DECODE;