Multipliers

Multiplication is repeated addition with shifts — exactly like the long-multiplication you learned in school, but in base 2, where each "digit product" is just an AND. The design computes a * b two ways:

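Combinationally, with assign product_comb = a * b;. On most modern FPGAs this maps to a hardened DSP block (DSP48 on AMD, DSP on Intel/Lattice) — a real multiplier circuit that's faster and cheaper than anything built from LUTs. (For operands as narrow as this 4x4 example the tools may decide a handful of LUTs is the better fit, and a few small devices have no DSP blocks at all.) Either way, as with adders: write * and let the tools place it.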
Sequentially, with a shift-and-add state machine that processes one bit of b per clock: if the current low bit of b is 1, add the (shifted) a into the accumulator. It needs N clocks but only one adder — the classic area-vs-time trade, still relevant when you need many low-rate multiplies and can't afford DSPs for each.

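Watch the sequential unit in the waveform: on the clock edge that samples start, busy rises and the operands are latched; the accumulator then builds up over the next 4 clocks, and on the fourth of those done pulses for one clock while product_seq takes the same answer the combinational multiplier produced instantly.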

The width rule: multiplying N-bit by M-bit needs N+M bits — 4x4 -> 8. Truncating a product without thinking is the classic DSP-path bug; decide explicitly which bits you keep (see the fixed-point converter for how Q-formats track this).

Experiment: make the sequential multiplier 8x8, or change it to skip runs of zero bits in b and count how many cycles typical inputs save.

The design

Verilog — design.v

// Two implementations of a * b side by side:
//   product_comb - plain '*' operator, purely combinational.
//   product_seq  - shift-and-add unit that examines one bit of b per clock.
//
// Sequential handshake: assert start while busy is low. The operands are
// latched on that clock edge and busy goes high. Four clock edges later
// busy drops, product_seq is updated, and done is high for one clock.
module multipliers (
    input  wire       clk,
    input  wire       rst,            // synchronous, active high
    input  wire       start,          // ignored while busy is high
    input  wire [3:0] a, b,
    output wire [7:0] product_comb,   // follows a and b with no clock
    output reg  [7:0] product_seq,    // holds the last sequential result
    output reg        busy,
    output reg        done            // one-clock pulse with the result
);
    // Index of the last bit of b to process (bits 0..3).
    localparam [2:0] LAST_BIT = 3'd3;

    reg [7:0] accum;         // sum of the partial products taken so far
    reg [7:0] a_shifted;     // a << (number of bits already processed)
    reg [3:0] b_remaining;   // bits of b not yet examined, LSB first
    reg [2:0] bit_idx;       // which bit of b is being examined

    // The single adder: the running sum plus the current partial product.
    wire [7:0] sum = accum + a_shifted;

    assign product_comb = a * b;

    always @(posedge clk) begin
        done <= 1'b0;                     // default: done lasts one clock
        if (rst) begin
            product_seq <= 8'd0;
            busy        <= 1'b0;
        end else if (busy) begin
            if (b_remaining[0])
                accum <= sum;
            a_shifted   <= {a_shifted[6:0], 1'b0};
            b_remaining <= {1'b0, b_remaining[3:1]};
            bit_idx     <= bit_idx + 3'd1;
            if (bit_idx == LAST_BIT) begin
                // accum only updates after this edge, so fold the last
                // partial product into the result directly.
                product_seq <= b_remaining[0] ? sum : accum;
                done        <= 1'b1;
                busy        <= 1'b0;
            end
        end else if (start && !busy) begin
            // !busy is kept in the condition so that an un-reset (X) busy
            // cannot trigger a load in simulation.
            accum       <= 8'd0;
            a_shifted   <= {4'b0000, a};
            b_remaining <= b;
            bit_idx     <= 3'd0;
            busy        <= 1'b1;
        end
    end
endmodule

Show the VHDL version

VHDL — design.vhd

-- Combinational multiply next to a shift-and-add sequential one.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;

-- 4x4 unsigned multiplier with two result paths:
--   product_comb : the "*" operator, purely combinational.
--   product_seq  : shift-and-add, one bit of b per clock.
-- Assert start while busy is low; busy then stays high for four clocks and
-- done is high for one clock when product_seq is updated.
entity multipliers is
    port (
        clk          : in  std_logic;
        rst          : in  std_logic;             -- synchronous, active high
        start        : in  std_logic;             -- ignored while busy
        a            : in  unsigned(3 downto 0);
        b            : in  unsigned(3 downto 0);
        product_comb : out unsigned(7 downto 0);
        product_seq  : out unsigned(7 downto 0);
        busy         : out std_logic;
        done         : out std_logic
    );
end entity multipliers;

architecture rtl of multipliers is
    -- Index of the last bit of b to process (bits 0 to 3).
    constant LAST_BIT : natural := 3;

    signal accum       : unsigned(7 downto 0);   -- partial products summed so far
    signal a_shifted   : unsigned(7 downto 0);   -- a shifted left once per processed bit
    signal b_remaining : unsigned(3 downto 0);   -- bits of b not yet examined, LSB first
    signal bit_idx     : unsigned(2 downto 0);   -- which bit of b is being examined
    signal running     : std_logic := '0';       -- readable copy of the busy port
begin
    busy         <= running;
    product_comb <= a * b;

    shift_add : process (clk)
        -- Running sum plus the current partial product (the one adder).
        variable sum : unsigned(7 downto 0);
    begin
        if rising_edge(clk) then
            done <= '0';                          -- default: done lasts one clock
            if rst = '1' then
                product_seq <= (others => '0');
                running     <= '0';
            elsif running = '1' then
                sum := accum + a_shifted;
                if b_remaining(0) = '1' then
                    accum <= sum;
                end if;
                a_shifted   <= a_shifted(6 downto 0) & '0';
                b_remaining <= '0' & b_remaining(3 downto 1);
                bit_idx     <= bit_idx + 1;
                if bit_idx = LAST_BIT then
                    -- accum only updates after this edge, so fold the last
                    -- partial product into the result directly.
                    if b_remaining(0) = '1' then
                        product_seq <= sum;
                    else
                        product_seq <= accum;
                    end if;
                    done    <= '1';
                    running <= '0';
                end if;
            elsif start = '1' then
                -- Idle and start requested: latch the operands.
                accum       <= (others => '0');
                a_shifted   <= resize(a, 8);
                b_remaining <= b;
                bit_idx     <= (others => '0');
                running     <= '1';
            end if;
        end if;
    end process shift_add;
end architecture rtl;

The testbench

Verilog — tb.v

`timescale 1ns/1ns
// Self-checking testbench for `multipliers`.
// Runs three multiplications through the sequential unit and, after each
// done pulse, compares both product_seq and product_comb with x * y.
// Mismatches are reported and counted, and a watchdog ends the run if done
// never arrives. The stimulus is dumped to wave.vcd for viewing.
module tb;
    reg clk = 0, rst = 1, start = 0;
    reg [3:0] a = 0, b = 0;
    wire [7:0] product_comb, product_seq;
    wire busy, done;

    integer errors = 0;   // number of failed comparisons

    multipliers dut (.clk(clk), .rst(rst), .start(start), .a(a), .b(b),
                     .product_comb(product_comb), .product_seq(product_seq),
                     .busy(busy), .done(done));

    always #5 clk = ~clk;   // 10 ns period

    // Start one multiplication, wait for done, and check both outputs.
    // Inputs change on the falling edge so they are stable when the DUT
    // samples them on the rising edge.
    task mul(input [3:0] x, input [3:0] y);
        reg [7:0] expected;
        begin
            expected = x * y;   // 8-bit target, so the full product is kept
            @(negedge clk); a = x; b = y; start = 1;
            @(negedge clk); start = 0;
            wait (done); @(negedge clk);
            // Half a clock after the done edge: product_seq has settled and
            // a/b still hold x/y, so product_comb must match as well.
            if (product_seq !== expected || product_comb !== expected) begin
                errors = errors + 1;
                $display("FAIL: %0d * %0d: expected %0d, product_seq=%0d product_comb=%0d",
                         x, y, expected, product_seq, product_comb);
            end
        end
    endtask

    // Watchdog: wait (done) above would block forever on a DUT that never
    // finishes, so bound the whole run.
    initial begin
        #10000;
        $display("FAIL: timeout waiting for done");
        $finish;
    end

    initial begin
        $dumpfile("wave.vcd"); $dumpvars(0, tb);
        #12 rst = 0;
        mul(4'd3,  4'd5);    // 15
        mul(4'd7,  4'd9);    // 63
        mul(4'hF,  4'hF);    // 225: max case
        #20;
        if (errors == 0)
            $display("PASS: all products matched");
        else
            $display("FAIL: %0d mismatch(es)", errors);
        $finish;
    end
endmodule

Simulated waveform

This trace was produced by actually simulating the code above with Icarus Verilog.