Skip to main content

bench_goldilocks_poseidon2/
field.rs

1// SPDX-License-Identifier: Apache-2.0 OR MIT
2// Copyright (C) Rostro Foundation
3
4//! Goldilocks field arithmetic — `F_p` with `p = 2^64 - 2^32 + 1`.
5//!
6//! Values are stored as `u64` and may be in `[0, 2^64)` (non-canonical) —
7//! the operations preserve correctness mod p but don't canonicalize on
8//! every step (canonicalization only at output via [`canonical`]).
9//! This matches Plonky3's `Goldilocks` representation.
10
11pub const P: u64 = 0xFFFF_FFFF_0000_0001;
12
13/// `2^64 - p = 2^32 - 1`. The "wrap-around correction" used in additions
14/// and the `* EPSILON` step of the reduction.
15const EPSILON: u64 = 0xFFFF_FFFF;
16
17pub const ZERO: u64 = 0;
18pub const ONE: u64 = 1;
19
20/// Add a + b mod p. Result may be non-canonical (in `[0, 2^64)`).
21#[inline(always)]
22pub fn add(a: u64, b: u64) -> u64 {
23    let (r, c) = a.overflowing_add(b);
24    if c {
25        // (a+b) wrapped past 2^64. Equivalent: result is r + (2^64 - p) = r + EPSILON.
26        // This may itself wrap past 2^64 if r is very large, but the
27        // double-wrap result is still correct mod p (and stays in u64).
28        r.wrapping_add(EPSILON)
29    } else {
30        r
31    }
32}
33
34/// Subtract a - b mod p. Result may be non-canonical.
35#[inline(always)]
36pub fn sub(a: u64, b: u64) -> u64 {
37    let (r, c) = a.overflowing_sub(b);
38    if c {
39        // a < b. Result is r + p (mod 2^64) = r - EPSILON.
40        r.wrapping_sub(EPSILON)
41    } else {
42        r
43    }
44}
45
46/// Multiply a * b mod p. Result may be non-canonical.
47#[inline(always)]
48pub fn mul(a: u64, b: u64) -> u64 {
49    reduce128((a as u128) * (b as u128))
50}
51
52/// Reduce a 128-bit value mod p (Goldilocks fast reduction).
53///
54/// Decomposition: `x = lo + hi * 2^64`, with `hi = hi_hi * 2^32 + hi_lo`.
55/// Modular identities:
56///   `2^64  ≡ 2^32 - 1   (mod p)`  → `hi_lo * 2^64 ≡ hi_lo * (2^32 - 1)`
57///   `2^96  ≡ -1         (mod p)`  → `hi_hi * 2^96 ≡ -hi_hi`
58///
59/// So `x mod p = lo - hi_hi + hi_lo * (2^32 - 1)`.
60#[inline(always)]
61fn reduce128(x: u128) -> u64 {
62    let lo = x as u64;
63    let hi = (x >> 64) as u64;
64    let hi_hi = hi >> 32;
65    let hi_lo = hi & EPSILON;
66
67    let (t0, borrow) = lo.overflowing_sub(hi_hi);
68    let t0 = if borrow { t0.wrapping_sub(EPSILON) } else { t0 };
69
70    let t1 = hi_lo.wrapping_mul(EPSILON);
71    let (r, carry) = t0.overflowing_add(t1);
72    if carry {
73        r.wrapping_add(EPSILON)
74    } else {
75        r
76    }
77}
78
79/// Canonicalize: return the unique representative in `[0, p)`.
80#[inline(always)]
81pub fn canonical(a: u64) -> u64 {
82    if a >= P {
83        a.wrapping_sub(P)
84    } else {
85        a
86    }
87}
88
89/// `x * x` shorthand — same cost as `mul(x, x)`, kept separate for
90/// readability in the S-box.
91#[inline(always)]
92pub fn square(x: u64) -> u64 {
93    mul(x, x)
94}
95
96/// 2 * x via `add(x, x)` — keeps the implementation in pure field ops
97/// rather than relying on the storage representation.
98#[inline(always)]
99pub fn double(x: u64) -> u64 {
100    add(x, x)
101}
102
103/// `base ^ exp` mod p via right-to-left binary square-and-multiply.
104///
105/// Used by [`inv`] for Fermat's-little-theorem inversion. Naïve
106/// algorithm (~64 squares + popcount(exp) muls) — Plonky3 uses a
107/// faster addition chain for inversion specifically, but for a
108/// benchmark of "how each VM handles exponentiation hot loops" this
109/// is what we want: predictable branchy dependent multiplies,
110/// exactly what the unoptimized algorithm looks like.
111#[inline]
112pub fn pow(base: u64, exp: u64) -> u64 {
113    if exp == 0 {
114        return ONE;
115    }
116    let mut result = ONE;
117    let mut b = base;
118    let mut e = exp;
119    while e > 0 {
120        if e & 1 == 1 {
121            result = mul(result, b);
122        }
123        b = mul(b, b);
124        e >>= 1;
125    }
126    result
127}
128
129/// Field inverse via Fermat's little theorem: `x^(p-2) mod p`.
130///
131/// `p - 2 = 0xFFFF_FFFE_FFFF_FFFF` — popcount = 63, so ~63 muls + ~64
132/// squares per inversion. Heavy: a single inverse costs ~127 mul-
133/// shaped ops. Montgomery's batch trick amortizes this to ~3 muls
134/// per inverse over N elements.
135#[inline]
136pub fn inv(x: u64) -> u64 {
137    pow(x, P - 2)
138}