- add some benches

- clean up dependencies
- will work on WASM and GM17
Alex Vlasov 2019-01-23 21:02:11 +03:00
parent 6e5cfe211f
commit e775b47d99
8 changed files with 144 additions and 178 deletions

.gitignore

@ -1,2 +1,3 @@
target
Cargo.lock
pkg

Cargo.toml
@ -6,7 +6,10 @@ homepage = "https://github.com/matterinc/bellman"
license = "MIT/Apache-2.0"
name = "bellman"
repository = "https://github.com/matterinc/bellman"
version = "0.1.2"
version = "0.1.3"
[lib]
crate-type = ["cdylib", "lib", "staticlib"]
[dependencies]
rand = "0.4"
@ -18,8 +21,6 @@ crossbeam = "0.3"
pairing = { git = 'https://github.com/matterinc/pairing' }
byteorder = "1"
ff = { git = 'https://github.com/matterinc/ff', features = ["derive"] }
pbr = "1.0.1"
time = "0.1"
[features]
default = []

src/domain.rs
@ -509,3 +509,59 @@ fn parallel_fft_consistency() {
test_consistency::<Bls12, _>(rng);
}
#[test]
fn test_field_element_multiplication_bn256() {
use rand::{self, Rand};
use pairing::bn256::Bn256;
use pairing::bn256::Fr;
use num_cpus;
let cpus = num_cpus::get();
const SAMPLES: usize = 1 << 27;
let rng = &mut rand::thread_rng();
let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
let v2 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
let v2 = EvaluationDomain::from_coeffs(v2).unwrap();
let pool = Worker::new();
let start = std::time::Instant::now();
v1.mul_assign(&pool, &v2);
let duration_ns = start.elapsed().as_nanos() as f64;
println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
let time_per_sample = duration_ns/(SAMPLES as f64);
println!("Tested on {} samples on {} CPUs with {} ns per field element multiplication", SAMPLES, cpus, time_per_sample);
}
#[test]
fn test_fft_bn256() {
use rand::{self, Rand};
use pairing::bn256::Bn256;
use pairing::bn256::Fr;
use num_cpus;
let cpus = num_cpus::get();
const SAMPLES: usize = 1 << 27;
let rng = &mut rand::thread_rng();
let v1 = (0..SAMPLES).map(|_| Scalar::<Bn256>(Fr::rand(rng))).collect::<Vec<_>>();
let mut v1 = EvaluationDomain::from_coeffs(v1).unwrap();
let pool = Worker::new();
let start = std::time::Instant::now();
v1.ifft(&pool);
let duration_ns = start.elapsed().as_nanos() as f64;
println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
let time_per_sample = duration_ns/(SAMPLES as f64);
println!("Tested on {} samples on {} CPUs with {} ns per field element multiplication", SAMPLES, cpus, time_per_sample);
}

src/groth16/generator.rs
@ -1,9 +1,5 @@
extern crate time;
use super::super::verbose_flag;
use self::time::PreciseTime;
use rand::Rng;
use std::sync::Arc;
@ -255,7 +251,9 @@ pub fn generate_parameters<E, C>(
{
// Compute powers of tau
if verbose {eprintln!("computing powers of tau...")};
let start = PreciseTime::now();
let start = std::time::Instant::now();
{
let powers_of_tau = powers_of_tau.as_mut();
worker.scope(powers_of_tau.len(), |scope, chunk| {
@ -272,14 +270,16 @@ pub fn generate_parameters<E, C>(
}
});
}
if verbose {eprintln!("powers of tau stage 1 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
if verbose {eprintln!("powers of tau stage 1 done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
// coeff = t(x) / delta
let mut coeff = powers_of_tau.z(&tau);
coeff.mul_assign(&delta_inverse);
if verbose {eprintln!("computing the H query with multiple threads...")};
let start = PreciseTime::now();
let start = std::time::Instant::now();
// Compute the H query with multiple threads
worker.scope(h.len(), |scope, chunk| {
for (h, p) in h.chunks_mut(chunk).zip(powers_of_tau.as_ref().chunks(chunk))
@ -302,17 +302,18 @@ pub fn generate_parameters<E, C>(
});
}
});
if verbose {eprintln!("computing the H query done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
if verbose {eprintln!("computing the H query done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
}
if verbose {eprintln!("using inverse FFT to convert powers of tau to Lagrange coefficients...")};
let start = PreciseTime::now();
let start = std::time::Instant::now();
// Use inverse FFT to convert powers of tau to Lagrange coefficients
powers_of_tau.ifft(&worker);
let powers_of_tau = powers_of_tau.into_coeffs();
if verbose {eprintln!("powers of tau stage 2 done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0)};
if verbose {eprintln!("powers of tau stage 2 done in {} s", start.elapsed().as_millis() as f64 / 1000.0)};
let mut a = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
let mut b_g1 = vec![E::G1::zero(); assembly.num_inputs + assembly.num_aux];
@ -321,7 +322,7 @@ pub fn generate_parameters<E, C>(
let mut l = vec![E::G1::zero(); assembly.num_aux];
if verbose {eprintln!("evaluating polynomials...")};
let start = PreciseTime::now();
let start = std::time::Instant::now();
fn eval<E: Engine>(
// wNAF window tables
@ -474,7 +475,7 @@ pub fn generate_parameters<E, C>(
&worker
);
if verbose {eprintln!("evaluating polynomials done in {} s", start.to(PreciseTime::now()).num_milliseconds() as f64 / 1000.0);};
if verbose {eprintln!("evaluating polynomials done in {} s", start.elapsed().as_millis() as f64 / 1000.0);};
// Don't allow any elements be unconstrained, so that
// the L query is always fully dense.

src/groth16/prover.rs
@ -1,6 +1,3 @@
extern crate time;
use self::time::PreciseTime;
use super::super::verbose_flag;
use rand::Rng;
@ -173,7 +170,7 @@ impl<E:Engine> PreparedProver<E> {
let vk = params.get_vk(self.assignment.input_assignment.len())?;
let h_start = PreciseTime::now();
let start = std::time::Instant::now();
let h = {
let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@ -209,10 +206,9 @@ impl<E:Engine> PreparedProver<E> {
multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
};
let h_end = PreciseTime::now();
if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_secs())};
let points_start = PreciseTime::now();
let start = std::time::Instant::now();
// TODO: Check that difference in operations for different chunks is small
@ -283,8 +279,7 @@ impl<E:Engine> PreparedProver<E> {
g_c.add_assign(&h.wait()?);
g_c.add_assign(&l.wait()?);
let points_end = PreciseTime::now();
if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_secs())};
Ok(Proof {
a: g_a.into_affine(),
@ -437,7 +432,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
let vk = params.get_vk(prover.input_assignment.len())?;
let h_start = PreciseTime::now();
let start = std::time::Instant::now();
let h = {
let mut a = EvaluationDomain::from_coeffs(prover.a)?;
@ -473,10 +468,9 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
multiexp(&worker, params.get_h(a.len())?, FullDensity, a)
};
let h_end = PreciseTime::now();
if verbose {eprintln!("{} seconds for prover for H evaluation", h_start.to(h_end))};
if verbose {eprintln!("{} seconds for prover for H evaluation", start.elapsed().as_secs())};
let points_start = PreciseTime::now();
let start = std::time::Instant::now();
// TODO: Check that difference in operations for different chunks is small
@ -547,8 +541,7 @@ pub fn create_proof<E, C, P: ParameterSource<E>>(
g_c.add_assign(&h.wait()?);
g_c.add_assign(&l.wait()?);
let points_end = PreciseTime::now();
if verbose {eprintln!("{} seconds for prover for point multiplication", points_start.to(points_end))};
if verbose {eprintln!("{} seconds for prover for point multiplication", start.elapsed().as_secs())};
Ok(Proof {
a: g_a.into_affine(),

src/lib.rs
@ -14,7 +14,6 @@ pub mod multicore;
mod multiexp;
pub mod domain;
pub mod groth16;
pub mod progress_bar;
use pairing::{Engine};
use ff::Field;

src/multiexp.rs
@ -142,6 +142,36 @@ impl DensityTracker {
}
}
/// This ingenious piece of code works in the following way:
/// - choose `c` - the bit length of the region that one thread works on
/// - make `2^c - 1` buckets and initialize them with `G = infinity` (that's equivalent of zero)
/// - there is no bucket for "zero" because it's not necessary
/// - go over the pairs `(base, scalar)`
/// - for each scalar calculate `scalar % 2^c` and add the base (without any multiplications!) to the
/// corresponding bucket
/// - at the end each bucket will have an accumulated value that should be multiplied by the corresponding factor
/// between `1` and `2^c - 1` to get the right value
/// - here comes the first trick - you don't need to do multiplications at all, just add all the buckets together
/// starting from the first one `(a + b + c + ...)`, then add to the first sum another sum of the form
/// `(b + c + d + ...)`, and then the third one `(c + d + ...)`; that results in the proper prefactor in front of every
/// accumulator, without any multiplication operations at all
/// - that's of course not enough, so spawn the next thread
/// - this thread works with the same bit width `c`, but SKIPS the lower bits completely, so it actually takes values
/// of the form `(scalar >> c) % 2^c`, i.e. it works on the next region
/// - spawn more threads until you exhaust the whole bit length
/// - you will get roughly `[bitlength / c] + 1` accumulators
/// - double the highest accumulator enough times, add to the next one, double the result, add the next accumulator, continue
///
/// Demo why it works:
/// ```
/// a * G + b * H = (a_2 * (2^c)^2 + a_1 * (2^c)^1 + a_0) * G + (b_2 * (2^c)^2 + b_1 * (2^c)^1 + b_0) * H
/// ```
/// - make buckets over the `0`-labeled coefficients
/// - make buckets over the `1`-labeled coefficients
/// - make buckets over the `2`-labeled coefficients
/// - accumulators over each set of buckets will have an implicit factor of `(2^c)^i`, so before summing them up
/// the "higher" accumulators must be doubled `c` times per label step
///
fn multiexp_inner<Q, D, G, S>(
pool: &Worker,
bases: S,
@ -195,7 +225,7 @@ fn multiexp_inner<Q, D, G, S>(
} else {
// Place multiplication into the bucket: Separate s * P as
// (s/2^c) * P + (s mod 2^c) P
// First multiplication is c bits less, do one can do it,
// First multiplication is c bits less, so one can do it,
// sum results from different buckets and double it c times,
// then add with (s mod 2^c) P parts
let mut exp = exp;
@ -317,3 +347,34 @@ fn test_with_bls12() {
assert_eq!(naive, fast);
}
#[test]
fn test_speed_with_bn256() {
use rand::{self, Rand};
use pairing::bn256::Bn256;
use num_cpus;
let cpus = num_cpus::get();
const SAMPLES: usize = 1 << 22;
let rng = &mut rand::thread_rng();
let v = Arc::new((0..SAMPLES).map(|_| <Bn256 as ScalarEngine>::Fr::rand(rng).into_repr()).collect::<Vec<_>>());
let g = Arc::new((0..SAMPLES).map(|_| <Bn256 as Engine>::G1::rand(rng).into_affine()).collect::<Vec<_>>());
let pool = Worker::new();
let start = std::time::Instant::now();
let _fast = multiexp(
&pool,
(g, 0),
FullDensity,
v
).wait().unwrap();
let duration_ns = start.elapsed().as_nanos() as f64;
println!("Elapsed {} ns for {} samples", duration_ns, SAMPLES);
let time_per_sample = duration_ns/(SAMPLES as f64);
println!("Tested on {} samples on {} CPUs with {} ns per multiplication", SAMPLES, cpus, time_per_sample);
}
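
The doc comment added above walks through the windowed bucket approach behind `multiexp_inner`. As a reading aid only (not part of this commit), here is a minimal sketch of the same idea over plain integers: `u64` addition stands in for group addition and doubling for point doubling, and the function name, window width `c`, and test values are all illustrative.

```
// Illustrative sketch (not from the crate): the bucket trick from the doc
// comment above, with u64 addition standing in for group addition.

fn bucket_multiexp(bases: &[u64], scalars: &[u64], c: u32) -> u64 {
    let windows = (64 + c - 1) / c;      // number of c-bit regions per scalar
    let mask = (1u64 << c) - 1;
    let mut window_acc = vec![0u64; windows as usize];

    for (w, acc) in window_acc.iter_mut().enumerate() {
        // 2^c - 1 buckets; the "zero" digit needs no bucket.
        let mut buckets = vec![0u64; (1usize << c) - 1];
        for (base, scalar) in bases.iter().zip(scalars) {
            let digit = ((scalar >> (w as u32 * c)) & mask) as usize;
            if digit != 0 {
                buckets[digit - 1] += base; // additions only, no multiplications
            }
        }
        // Prefix sums weight bucket i by (i + 1): (a+b+c+...), then (b+c+...), ...
        let mut running = 0u64;
        for b in buckets.iter().rev() {
            running += b;
            *acc += running;
        }
    }

    // Combine regions: double the highest accumulator c times, add the next
    // one, double again, and so on.
    let mut result = 0u64;
    for acc in window_acc.iter().rev() {
        for _ in 0..c {
            result = result.wrapping_add(result); // doubling
        }
        result = result.wrapping_add(*acc);
    }
    result
}

fn main() {
    let bases = [3u64, 5, 7];
    let scalars = [2u64, 10, 100];
    let naive: u64 = bases.iter().zip(&scalars).map(|(b, s)| b * s).sum();
    assert_eq!(bucket_multiexp(&bases, &scalars, 4), naive);
    println!("bucket sum matches naive sum: {}", naive);
}
```

Running this reproduces the naive sum of `base * scalar`; in the real code the same structure is split across threads, one c-bit region per thread, and the additions are group operations on curve points.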

src/progress_bar.rs
@ -1,146 +0,0 @@
extern crate time;
use std::io::{Write};
use std::sync::{
mpsc::{channel, Sender, Receiver},
Arc,
atomic::{AtomicUsize, Ordering}
};
use self::time::precise_time_ns;
use std::time::Duration;
static UPDATE_INTERVAL: u64 = 1000_000 * 1000; // ms
pub struct MultiBar {
n_workers: u64,
total: u64,
cur: u64,
prev: u64,
prev_time: u64,
total_elapsed: u64,
step: Arc<AtomicUsize>,
tx: Sender<u64>,
rx: Receiver<u64>,
}
pub struct ProgressBar {
//chunk: u64,
acc: u64,
step: Arc<AtomicUsize>,
tx: Option<Sender<u64>>,
}
/// Simple efficient thread-safe progress indicator
/// It follows the interface of [https://github.com/a8m/pb](https://github.com/a8m/pb)
impl MultiBar {
/// Create a new MultiBar for stdout
pub fn new() -> Self {
let (tx, rx) = channel();
Self{
n_workers: 0,
total: 0,
cur: 0,
prev: 0,
prev_time: precise_time_ns(),
total_elapsed: 0,
step: Arc::new(AtomicUsize::new(1)),
tx,
rx,
}
}
// Create a ProgressBar for a process of `total` steps
pub fn create_bar(&mut self, chunk: u64) -> ProgressBar {
self.n_workers += 1;
self.total += chunk;
//println!("step 0 of {}", chunk);
ProgressBar{
//chunk,
acc: 0,
tx: Some(Sender::clone(&self.tx)),
step: Arc::clone(&self.step),
}
}
/// Start listening for updates from ProgressBars in different threads
pub fn listen(&mut self) {
//println!("");
for d in &self.rx {
if d == 0 {
self.n_workers -= 1;
}
if self.n_workers == 0 {
break;
}
self.cur += d;
let processed = self.cur - self.prev;
if processed > self.step.load(Ordering::Acquire) as u64 * self.n_workers {
let now = time::precise_time_ns();
let elapsed = now - self.prev_time;
if elapsed > UPDATE_INTERVAL {
self.prev = self.cur;
self.prev_time = now;
self.total_elapsed += elapsed;
print!("\rprocessed {:2}%: {} of {}.", self.cur * 100 / self.total, self.cur, self.total);
let r = Duration::from_nanos((self.total - self.cur) * self.total_elapsed / self.cur).as_secs();
print!(" Remaining estimated: {} h {} min {} s", r / 3600, r % 3600 / 60, r % 60);
let new_step = (self.cur * UPDATE_INTERVAL / self.total_elapsed) / self.n_workers;
self.step.store(new_step as usize, Ordering::Release);
std::io::stdout().flush().unwrap();
}
}
}
println!("\rdone ");
}
}
impl ProgressBar {
/// Increment progress by `d` steps
pub fn add(&mut self, d: u64) {
self.acc += d;
if self.acc > (self.step.load(Ordering::Relaxed) as u64) {
if let Some(tx) = &self.tx {
tx.send(self.acc).unwrap();
}
self.acc = 0;
}
}
/// Finish the process
pub fn finish(&mut self) {
let tx = self.tx.take().unwrap();
tx.send(0).unwrap();
drop(tx);
}
}
#[test]
fn test_progress_display() {
let mut mb = MultiBar::new();
for _j in 1..=0 {
let mut pb = mb.create_bar(3600000);
std::thread::spawn(move || {
for _i in 0..3600000 {
std::thread::sleep(Duration::from_millis(1));
pb.add(1);
}
pb.finish();
});
};
//mb.listen();
}