pub fn square(x: u64) -> u64
Shorthand for `x * x` — same cost as `mul(x, x)`; kept separate for readability in the S-box.
x * x
mul(x, x)