Parallel FFTs.

This commit is contained in:
Sean Bowe 2017-05-14 19:37:02 -06:00
parent 669853de99
commit bb73258ce9
4 changed files with 167 additions and 52 deletions

@ -44,6 +44,9 @@ macro_rules! curve_impl {
} }
impl Group<$engine> for $name { impl Group<$engine> for $name {
fn group_zero(e: &$engine) -> $name {
$name::zero(e)
}
fn group_mul_assign(&mut self, e: &$engine, scalar: &$scalarfield) { fn group_mul_assign(&mut self, e: &$engine, scalar: &$scalarfield) {
self.mul_assign(e, scalar); self.mul_assign(e, scalar);
} }

@ -97,6 +97,9 @@ fp_impl!(
); );
impl Group<Bls381> for Fr { impl Group<Bls381> for Fr {
fn group_zero(_: &Bls381) -> Fr {
Fr::zero()
}
fn group_mul_assign(&mut self, e: &Bls381, scalar: &Fr) { fn group_mul_assign(&mut self, e: &Bls381, scalar: &Fr) {
self.mul_assign(e, scalar); self.mul_assign(e, scalar);
} }

@ -9,7 +9,7 @@ use super::{Cow, Convert};
pub mod bls381; pub mod bls381;
pub trait Engine: Sized + Clone pub trait Engine: Sized + Clone + Send + Sync
{ {
type Fq: PrimeField<Self>; type Fq: PrimeField<Self>;
type Fr: SnarkField<Self>; type Fr: SnarkField<Self>;
@ -46,8 +46,9 @@ pub trait Engine: Sized + Clone
fn batch_baseexp<G: Curve<Self>, S: AsRef<[Self::Fr]>>(&self, table: &WindowTable<Self, G, Vec<G>>, scalars: S) -> Vec<G::Affine>; fn batch_baseexp<G: Curve<Self>, S: AsRef<[Self::Fr]>>(&self, table: &WindowTable<Self, G, Vec<G>>, scalars: S) -> Vec<G::Affine>;
} }
pub trait Group<E: Engine>: Copy pub trait Group<E: Engine>: Copy + Send + Sync + Sized
{ {
fn group_zero(&E) -> Self;
fn group_mul_assign(&mut self, &E, scalar: &E::Fr); fn group_mul_assign(&mut self, &E, scalar: &E::Fr);
fn group_add_assign(&mut self, &E, other: &Self); fn group_add_assign(&mut self, &E, other: &Self);
fn group_sub_assign(&mut self, &E, other: &Self); fn group_sub_assign(&mut self, &E, other: &Self);

@ -1,4 +1,6 @@
use curves::{Engine, Field, SnarkField, PrimeField, Group}; use curves::{Engine, Field, SnarkField, PrimeField, Group};
use crossbeam;
use num_cpus;
pub struct EvaluationDomain<E: Engine> { pub struct EvaluationDomain<E: Engine> {
pub m: u64, pub m: u64,
@ -49,19 +51,36 @@ impl<E: Engine> EvaluationDomain<E> {
pub fn ifft<T: Group<E>>(&self, e: &E, v: &mut [T]) pub fn ifft<T: Group<E>>(&self, e: &E, v: &mut [T])
{ {
assert!(v.len() == self.m as usize); assert!(v.len() == self.m as usize);
self._fft(e, v, &self.omegainv); parallel_fft(e, v, &self.omegainv, self.exp);
let chunk = (v.len() / num_cpus::get()) + 1;
crossbeam::scope(|scope| {
for v in v.chunks_mut(chunk) {
scope.spawn(move || {
for v in v { for v in v {
v.group_mul_assign(e, &self.minv); v.group_mul_assign(e, &self.minv);
} }
});
}
});
} }
fn mul_coset(&self, e: &E, v: &mut [E::Fr], g: &E::Fr) fn mul_coset(&self, e: &E, v: &mut [E::Fr], g: &E::Fr)
{ {
let mut u = *g; let chunk = (v.len() / num_cpus::get()) + 1;
for v in v.iter_mut().skip(1) {
crossbeam::scope(|scope| {
for (i, v) in v.chunks_mut(chunk).enumerate() {
scope.spawn(move || {
let mut u = g.pow(e, &[(i * chunk) as u64]);
for v in v.iter_mut() {
v.mul_assign(e, &u); v.mul_assign(e, &u);
u.mul_assign(e, g); u.mul_assign(e, g);
} }
});
}
});
} }
pub fn coset_fft(&self, e: &E, v: &mut [E::Fr]) pub fn coset_fft(&self, e: &E, v: &mut [E::Fr])
@ -79,18 +98,76 @@ impl<E: Engine> EvaluationDomain<E> {
pub fn divide_by_z_on_coset(&self, e: &E, v: &mut [E::Fr]) pub fn divide_by_z_on_coset(&self, e: &E, v: &mut [E::Fr])
{ {
let i = self.z(e, &E::Fr::multiplicative_generator(e)).inverse(e).unwrap(); let i = self.z(e, &E::Fr::multiplicative_generator(e)).inverse(e).unwrap();
let chunk = (v.len() / num_cpus::get()) + 1;
crossbeam::scope(|scope| {
for v in v.chunks_mut(chunk) {
scope.spawn(move || {
for v in v { for v in v {
v.mul_assign(e, &i); v.mul_assign(e, &i);
} }
});
}
});
} }
pub fn fft<T: Group<E>>(&self, e: &E, a: &mut [T]) pub fn fft<T: Group<E>>(&self, e: &E, a: &mut [T])
{ {
self._fft(e, a, &self.omega); parallel_fft(e, a, &self.omega, self.exp);
}
}
fn parallel_fft<E: Engine, T: Group<E>>(e: &E, a: &mut [T], omega: &E::Fr, log_n: u64)
{
let log_cpus = get_log_cpus();
let num_cpus = 1 << log_cpus;
if log_n < log_cpus {
serial_fft(e, a, omega, log_n)
} else {
// Shuffle
let log_new_n = log_n - log_cpus;
let mut tmp = vec![vec![T::group_zero(e); 1 << log_new_n]; num_cpus];
let omega_num_cpus = omega.pow(e, &[num_cpus as u64]);
crossbeam::scope(|scope| {
let a = &*a;
for (j, tmp) in tmp.iter_mut().enumerate() {
scope.spawn(move || {
let omega_j = omega.pow(e, &[j as u64]);
let omega_step = omega.pow(e, &[(j as u64) << log_new_n]);
let mut elt = E::Fr::one(e);
for i in 0..(1 << log_new_n) {
for s in 0..num_cpus {
let idx = (i + (s << log_new_n)) % (1 << log_n);
let mut t = a[idx];
t.group_mul_assign(e, &elt);
tmp[i].group_add_assign(e, &t);
elt.mul_assign(e, &omega_step);
}
elt.mul_assign(e, &omega_j);
} }
fn _fft<T: Group<E>>(&self, e: &E, a: &mut [T], omega: &E::Fr) serial_fft(e, tmp, &omega_num_cpus, log_new_n);
{ });
}
});
// TODO: parallelize
// Unshuffle
for i in 0..num_cpus {
for j in 0..(1 << log_new_n) {
a[(j << log_cpus) + i] = tmp[i][j];
}
}
}
}
fn serial_fft<E: Engine, T: Group<E>>(e: &E, a: &mut [T], omega: &E::Fr, log_n: u64)
{
fn bitreverse(mut n: usize, l: u64) -> usize { fn bitreverse(mut n: usize, l: u64) -> usize {
let mut r = 0; let mut r = 0;
for _ in 0..l { for _ in 0..l {
@ -100,8 +177,11 @@ impl<E: Engine> EvaluationDomain<E> {
r r
} }
for k in 0..(self.m as usize) { let n = a.len();
let rk = bitreverse(k, self.exp); assert_eq!(n, 1 << log_n);
for k in 0..n {
let rk = bitreverse(k, log_n);
if k < rk { if k < rk {
let tmp1 = a[rk]; let tmp1 = a[rk];
let tmp2 = a[k]; let tmp2 = a[k];
@ -111,11 +191,11 @@ impl<E: Engine> EvaluationDomain<E> {
} }
let mut m = 1; let mut m = 1;
for _ in 0..self.exp { for _ in 0..log_n {
let w_m = omega.pow(e, &[(self.m / (2*m)) as u64]); let w_m = omega.pow(e, &[(n / (2*m)) as u64]);
let mut k = 0; let mut k = 0;
while k < self.m { while k < n {
let mut w = E::Fr::one(e); let mut w = E::Fr::one(e);
for j in 0..m { for j in 0..m {
let mut t = a[(k+j+m) as usize]; let mut t = a[(k+j+m) as usize];
@ -132,7 +212,6 @@ impl<E: Engine> EvaluationDomain<E> {
m *= 2; m *= 2;
} }
}
} }
// Test multiplying various (low degree) polynomials together and // Test multiplying various (low degree) polynomials together and
@ -190,3 +269,32 @@ fn polynomial_arith() {
test_mul(e, rng); test_mul(e, rng);
} }
fn get_log_cpus() -> u64 {
let num = num_cpus::get();
log2_floor(num)
}
fn log2_floor(num: usize) -> u64 {
assert!(num > 0);
let mut pow = 0;
while (1 << (pow+1)) <= num {
pow += 1;
}
pow
}
#[test]
fn test_log2_floor() {
assert_eq!(log2_floor(1), 0);
assert_eq!(log2_floor(2), 1);
assert_eq!(log2_floor(3), 1);
assert_eq!(log2_floor(4), 2);
assert_eq!(log2_floor(5), 2);
assert_eq!(log2_floor(6), 2);
assert_eq!(log2_floor(7), 2);
assert_eq!(log2_floor(8), 3);
}