Multiplication is repeated addition with shifts — exactly like the
long multiplication you learned in school, but in base 2, where each
"digit product" is just an AND. The design computes a * b two ways:
Combinationally, with a single continuous assignment:
assign product_comb = a * b; On any modern FPGA this typically maps to
a hardened DSP block (DSP48 on AMD, DSP on Intel/Lattice) —
a real multiplier circuit that's faster and cheaper than anything built
from LUTs. As with adders: write * and let the tools place it.
Sequentially, with a shift-and-add state machine that processes one
bit of b per clock: if the current low bit of b is 1, add the
(shifted) a into the accumulator. It needs N clocks but only one
adder — the classic area-vs-time trade, still relevant when you need
many low-rate multiplies and can't afford DSPs for each.
Watch the sequential unit in the waveform: after start, busy rises,
the accumulator builds up over 4 clocks, then done pulses with the same
answer the combinational multiplier produced instantly.
The width rule: multiplying N-bit by M-bit needs N+M bits —
4x4 -> 8. Truncating a product without thinking is the classic DSP-path
bug; decide explicitly which bits you keep (see the
fixed-point converter for how Q-formats track this).
Experiment: make the sequential multiplier 8x8, or change it to skip
runs of zero bits in b and count how many cycles typical inputs save.
The design
Verilog — design.v
// Combinational (DSP) multiply next to a shift-and-add sequential one.
//
// Parameter
//   WIDTH        : operand width in bits. Defaults to 4, which reproduces the
//                  original 4x4 -> 8 design; the product is always 2*WIDTH bits.
//
// Ports
//   clk          : clock; every register updates on its rising edge.
//   rst          : synchronous reset; clears busy and product_seq.
//   start        : begins a sequential multiply; ignored while busy is high.
//   a, b         : unsigned operands. The sequential unit copies them into
//                  internal registers on the start edge.
//   product_comb : a * b, purely combinational, full 2*WIDTH bits.
//   product_seq  : result of the most recently finished sequential multiply.
//   busy         : high while the sequential unit is iterating (WIDTH clocks).
//   done         : one-clock pulse, raised on the same edge that updates
//                  product_seq.
//
// busy has no initial value and is only forced low by rst, so assert rst
// before the first start.
module multipliers #(
    parameter WIDTH = 4
) (
    input  wire               clk,
    input  wire               rst,
    input  wire               start,
    input  wire [WIDTH-1:0]   a, b,
    output wire [2*WIDTH-1:0] product_comb,
    output reg  [2*WIDTH-1:0] product_seq,
    output reg                busy,
    output reg                done
);
    // The counter runs 0 .. WIDTH-1 and is incremented once more on the last
    // iteration, so it must be able to hold the value WIDTH itself.
    localparam COUNT_W = $clog2(WIDTH) + 1;

    // Operands are extended to the 2*WIDTH-bit result width before the
    // multiply, so no product bits are lost.
    assign product_comb = a * b;

    reg [2*WIDTH-1:0] acc, addend;   // running sum / a shifted left by the bit index
    reg [WIDTH-1:0]   multiplier;    // remaining bits of b, LSB is the current bit
    reg [COUNT_W-1:0] count;         // number of bits of b already processed

    // The single adder of the design: the accumulator value after consuming
    // the current bit of b. Used both for the running sum and for the final
    // result, so the add is written exactly once.
    wire [2*WIDTH-1:0] acc_next = multiplier[0] ? acc + addend : acc;

    always @(posedge clk) begin
        done <= 1'b0;                          // default: done is a one-clock pulse
        if (rst) begin
            busy        <= 1'b0;
            product_seq <= {(2*WIDTH){1'b0}};
        end else if (start && !busy) begin
            // Latch the operands and clear the working state.
            busy       <= 1'b1;
            acc        <= {(2*WIDTH){1'b0}};
            addend     <= {{WIDTH{1'b0}}, a};
            multiplier <= b;
            count      <= {COUNT_W{1'b0}};
        end else if (busy) begin
            // One bit of b per clock: conditionally add, then shift both.
            acc        <= acc_next;
            addend     <= addend << 1;
            multiplier <= multiplier >> 1;
            count      <= count + 1'b1;
            if (count == WIDTH - 1) begin
                // Last bit: acc has not been updated yet on this edge, so the
                // result is taken from acc_next rather than acc.
                busy        <= 1'b0;
                done        <= 1'b1;
                product_seq <= acc_next;
            end
        end
    end
endmodule
Show the VHDL version
VHDL — design.vhd
-- Combinational multiply next to a shift-and-add sequential one.
library ieee;
use ieee.std_logic_1164.all;
use ieee.numeric_std.all;
-- multipliers: a combinational multiply beside a shift-and-add sequential one.
--
-- Generic
--   WIDTH        : operand width in bits. Defaults to 4, which reproduces the
--                  original 4x4 -> 8 design; the product is always 2*WIDTH bits.
--
-- Ports
--   clk          : clock; every register updates on its rising edge.
--   rst          : synchronous reset; clears busy and product_seq.
--   start        : begins a sequential multiply; ignored while busy is high.
--   a, b         : unsigned operands. The sequential unit copies them into
--                  internal registers on the start edge.
--   product_comb : a * b, purely combinational, full 2*WIDTH bits.
--   product_seq  : result of the most recently finished sequential multiply.
--   busy         : high while the sequential unit is iterating (WIDTH clocks).
--   done         : one-clock pulse, raised on the same edge that updates
--                  product_seq.
entity multipliers is
  generic (
    WIDTH : positive := 4
  );
  port (
    clk, rst, start : in  std_logic;
    a, b            : in  unsigned(WIDTH - 1 downto 0);
    product_comb    : out unsigned(2 * WIDTH - 1 downto 0);
    product_seq     : out unsigned(2 * WIDTH - 1 downto 0);
    busy, done      : out std_logic
  );
end entity;

architecture rtl of multipliers is
  signal acc, addend : unsigned(2 * WIDTH - 1 downto 0);  -- running sum / shifted a
  signal acc_next    : unsigned(2 * WIDTH - 1 downto 0);  -- acc after the current bit
  signal m           : unsigned(WIDTH - 1 downto 0);      -- remaining bits of b
  -- Runs 0 .. WIDTH-1 and is incremented once more on the last iteration,
  -- so the range has to include WIDTH itself.
  signal count       : natural range 0 to WIDTH;
  signal busy_i      : std_logic := '0';                  -- readable copy of busy
begin
  -- numeric_std "*" returns a'length + b'length bits, i.e. the full product.
  product_comb <= a * b;
  busy         <= busy_i;

  -- The single adder of the design: the accumulator value after consuming
  -- the current bit of b. Used both for the running sum and for the final
  -- result, so the add is written exactly once.
  acc_next <= acc + addend when m(0) = '1' else acc;

  process (clk) begin
    if rising_edge(clk) then
      done <= '0';  -- default: done is a one-clock pulse
      if rst = '1' then
        busy_i      <= '0';
        product_seq <= (others => '0');
      elsif start = '1' and busy_i = '0' then
        -- Latch the operands and clear the working state.
        busy_i <= '1';
        acc    <= (others => '0');
        addend <= resize(a, addend'length);
        m      <= b;
        count  <= 0;
      elsif busy_i = '1' then
        -- One bit of b per clock: conditionally add, then shift both.
        acc    <= acc_next;
        addend <= shift_left(addend, 1);
        m      <= shift_right(m, 1);
        count  <= count + 1;
        if count = WIDTH - 1 then
          -- Last bit: acc has not been updated yet on this edge, so the
          -- result is taken from acc_next rather than acc.
          busy_i      <= '0';
          done        <= '1';
          product_seq <= acc_next;
        end if;
      end if;
    end if;
  end process;
end architecture;