|
| void | radix_sort_count_zero_entries (uint64_t *keys, size_t num_entries, uint32_t shift, size_t &num_zero_entries, uint32_t bucket_index_bits, const uint64_t *top_level_keys) noexcept |
| | Recursive MSD radix sort that also counts entries with zero bucket index.
|
| |
| size_t | sort_point_schedule_and_count_zero_buckets (uint64_t *point_schedule, size_t num_entries, uint32_t bucket_index_bits) noexcept |
| | Sort point schedule by bucket index and count zero-bucket entries.
|
| |
| bool | use_legacy_msm () noexcept |
| |
| template<typename Curve > |
| Curve::Element | pippenger (PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template<typename Curve > |
| Curve::Element | pippenger_unsafe (PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool dedup_hint) noexcept |
| |
| template curve::BN254::Element | pippenger< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template curve::Grumpkin::Element | pippenger< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template curve::BN254::Element | pippenger_unsafe< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, bool dedup_hint) noexcept |
| |
| template curve::Grumpkin::Element | pippenger_unsafe< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, bool dedup_hint) noexcept |
| |
| size_t | window_bits_tuning_oversub_factor (size_t n_input) |
| | N-dependent oversubscription factor used ONLY for choose_window_bits' target_load formula (not for actual thread dispatch).
|
| |
| template<typename Curve > |
| size_t | compute_arena_bytes_for_msm (size_t n_input, bool external_glv_provided, bool dedup_active) noexcept |
| | Round-parallel Pippenger MSM_fast. Windows process sequentially (high-to-low) but each window is fully parallel across threads. Windows are processed in batches of windows_in_batch to amortise parallel_for barriers; the batch count is sized at runtime to fit BATCH_MEM_BUDGET (~32 MiB).
|
| |
| template<typename Curve > |
| Curve::Element | pippenger_round_parallel (PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool dedup_hint=false, std::span< const typename Curve::AffineElement > external_glv_doubled={}, std::span< std::byte > external_arena={}) noexcept |
| | State of the art pippenger_fast multiscalar multiplication algorithm.
|
| |
| template<typename Curve > |
| Curve::Element | pippenger_unsafe_fast (PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool dedup_hint) noexcept |
| |
| template<typename Curve > |
| Curve::Element | pippenger_fast (PolynomialSpan< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template curve::BN254::Element | pippenger_unsafe_fast< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, bool dedup_hint) noexcept |
| |
| template curve::Grumpkin::Element | pippenger_unsafe_fast< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, bool dedup_hint) noexcept |
| |
| template curve::BN254::Element | pippenger_fast< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template curve::Grumpkin::Element | pippenger_fast< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, bool handle_edge_cases, bool dedup_hint) noexcept |
| |
| template curve::BN254::Element | pippenger_round_parallel< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, bool dedup_hint, std::span< const curve::BN254::AffineElement > external_glv_doubled, std::span< std::byte > external_arena) noexcept |
| |
| template curve::Grumpkin::Element | pippenger_round_parallel< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, bool dedup_hint, std::span< const curve::Grumpkin::AffineElement > external_glv_doubled, std::span< std::byte > external_arena) noexcept |
| |
| template curve::BN254::Element | trivial_msm< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars_span, std::span< const curve::BN254::AffineElement > all_points) noexcept |
| |
| template curve::Grumpkin::Element | trivial_msm< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars_span, std::span< const curve::Grumpkin::AffineElement > all_points) noexcept |
| |
| template curve::BN254::Element | trivial_msm_threaded< curve::BN254 > (PolynomialSpan< const curve::BN254::ScalarField > scalars_span, std::span< const curve::BN254::AffineElement > all_points) noexcept |
| |
| template curve::Grumpkin::Element | trivial_msm_threaded< curve::Grumpkin > (PolynomialSpan< const curve::Grumpkin::ScalarField > scalars_span, std::span< const curve::Grumpkin::AffineElement > all_points) noexcept |
| |
| template size_t | compute_arena_bytes_for_msm< curve::BN254 > (size_t, bool, bool) noexcept |
| |
| template<typename Curve > |
| Curve::Element | trivial_msm (PolynomialSpan< const typename Curve::ScalarField > scalars_span, std::span< const typename Curve::AffineElement > all_points) noexcept |
| | Single-threaded small-MSM_fast driver: Element::straus_msm over the input slice.
|
| |
| template<typename Curve > |
| Curve::Element | trivial_msm_threaded (PolynomialSpan< const typename Curve::ScalarField > scalars_span, std::span< const typename Curve::AffineElement > all_points) noexcept |
| | Multi-threaded small-MSM_fast driver: parallel Element::straus_msm over zero-skipped input slices.
|
| |
template<typename
Curve >
State of the art pippenger_fast multiscalar multiplication algorithm.
A traditional pippenger_fast N/logN algorithm (split scalars into windows, use window value to map scalar's base point into a bucket. Accumulate buckets. Repeat for all windows (1 window = 1 round)) We add the following optimizations on top of the algorithm: 1) Efficient multithreading via round-parallelism. If memory budget allows each thread evaluates multiple rounds. Thread efficiency ~90% when measured. 2) Booth in-place recoding of scalars. Each slice represents [-(num_buckets/2- 1), ..., (num_buckets/2 - 1)]. halves number of buckets. 3) GLV decompositon of input scalars into two half-size scalars. Halves number of bucket accumulation steps. Only used for n<2^{16} due to memory throughput tradeoffs. 4) Adaptive bucket range. When mixing large and small scalars (e.g. witness values), low bit-ranges use a larger bucket-range. Larger bit-ranges use a bucket-range tuned to the reduced number of large scalars. 5) Duplicate stripping. Witness commitments and permutation polynomials contain large numbers of duplicates. When dedup flag is active these are detected and base points consolidated prior to main MSM_fast. 6) Batch-affine arithmetic. When accumulating points into buckets, independent point additions are gathered and batched, allowing for affine point arithmetic w. batch invert. Cost = 6M for addition, 7M for doubling (M=field mult). 7) Batch-affine bucket accumulation. Uses novel technique from (TODO REF) to accumulate buckets in runs of independent additions. 8) If n is small (num points per thread < ~24), fallback to optimised multithreaded Straus MSM_fast (windowed double-and-add with GLV endomorphism)
In order to efficiently utilize memory and prevent WASM memory fragmentation, we use a single arena memory buffer to allocate all temporary data structures. Currently sized at 36MB which defines the upper cap on the memory consumed by this algorihtm.
- Parameters
-
| scalars | Input scalars |
| points | Input points |
| dedup_hint | Activates duplicate stripping if true. Off by default as dup stripping adds ~5% overhead on random inputs |
- Precondition
- Inputs are linearly independent: no point-at-infinity, no equal-x within a bucket (matches
pippenger_unsafe_fast).
- Note
- Scalars are converted out of Montgomery form internally and restored before return.
Definition at line 1264 of file scalar_multiplication_fast.cpp.